fedi-feed-router/backend/src/Domains/Article/Parsers/Belga/BelgaHomepageParser.php

<?php

namespace Domains\Article\Parsers\Belga;

class BelgaHomepageParser
{
    /**
     * Origin prepended to every relative article path that is kept.
     */
    private const BASE_URL = 'https://www.belganewsagency.eu';

    /**
     * Paths that are treated as non-article pages. A link is dropped when its
     * path equals one of these entries or lives underneath one of them.
     *
     * @var array<int, string>
     */
    private const BLACKLIST_PATHS = [
        '/',
        '/de',
        '/feed',
        '/search',
        '/category',
        '/about',
        '/contact',
        '/privacy',
        '/terms',
    ];

    /**
     * Extract absolute article URLs from homepage HTML.
     *
     * Only anchors whose double-quoted `href` is a single-segment relative path
     * made of lowercase letters, digits and dashes (e.g. `/some-article-slug`)
     * are considered. Duplicates are removed (first occurrence wins, document
     * order is preserved), blacklisted paths are dropped, and the remaining
     * paths are prefixed with the site origin.
     *
     * @param string $html Raw HTML to scan for links.
     *
     * @return array<int, string> Sequentially indexed absolute URLs; empty when
     *                            nothing matches or the regex engine fails.
     */
    public static function extractArticleUrls(string $html): array
    {
        // `<a\s` keeps other tags that start with "a" (<area>, <abbr>, <article>, ...)
        // from matching, and the whitespace required right before `href` keeps
        // attributes such as `data-href` from being mistaken for the real link.
        $matchCount = preg_match_all(
            '/<a\s(?:[^>]*\s)?href="(\/[a-z0-9-]+)"/',
            $html,
            $matches
        );

        // preg_match_all() returns false on a PCRE error and 0 when no anchor
        // matched; in both cases there is nothing to extract.
        if ($matchCount === false || $matchCount === 0) {
            return [];
        }

        $articlePaths = array_filter(
            array_unique($matches[1]),
            static fn (string $path): bool => !self::isBlacklisted($path)
        );

        // array_filter()/array_unique() keep the original keys, so reindex to
        // hand back a clean 0..n-1 list.
        return array_values(array_map(
            static fn (string $path): string => self::BASE_URL . $path,
            $articlePaths
        ));
    }

    /**
     * Tell whether a relative path is a blacklisted page or sits below one.
     */
    private static function isBlacklisted(string $path): bool
    {
        foreach (self::BLACKLIST_PATHS as $blacklistedPath) {
            if ($path === $blacklistedPath || str_starts_with($path, $blacklistedPath . '/')) {
                return true;
            }
        }

        return false;
    }
}
Add multiple sources 2025-06-29 21:39:28 +02:00			`<?php`

Apply domain structure 2025-08-15 16:39:18 +02:00			`namespace Domains\Article\Parsers\Belga;`
Add multiple sources 2025-06-29 21:39:28 +02:00
			`class BelgaHomepageParser`
			`{`
Raise PHPStan level tp 6 2025-07-07 00:51:32 +02:00			`/**`
			`* @return array<int, string>`
			`*/`
Add multiple sources 2025-06-29 21:39:28 +02:00			`public static function extractArticleUrls(string $html): array`
			`{`
Fix article fetching 2025-08-09 18:34:19 +02:00			`// Find all relative article links (most articles use relative paths)`
			`preg_match_all('/<a[^>]+href="(\/[a-z0-9-]+)"/', $html, $matches);`

			`// Blacklist of non-article paths`
			`$blacklistPaths = [`
			`'/',`
			`'/de',`
			`'/feed',`
			`'/search',`
			`'/category',`
			`'/about',`
			`'/contact',`
			`'/privacy',`
			`'/terms',`
			`];`
Add multiple sources 2025-06-29 21:39:28 +02:00
Raise PHPStan level tp 4 2025-07-06 20:45:40 +02:00			`$urls = collect($matches[1])`
Add multiple sources 2025-06-29 21:39:28 +02:00			`->unique()`
Fix article fetching 2025-08-09 18:34:19 +02:00			`->filter(function ($path) use ($blacklistPaths) {`
			`// Exclude exact matches and paths starting with blacklisted paths`
			`foreach ($blacklistPaths as $blacklistedPath) {`
			`if ($path === $blacklistedPath \|\| str_starts_with($path, $blacklistedPath . '/')) {`
			`return false;`
			`}`
			`}`
			`return true;`
			`})`
			`->map(function ($path) {`
			`// Convert relative paths to absolute URLs`
			`return 'https://www.belganewsagency.eu' . $path;`
			`})`
			`->values()`
Add multiple sources 2025-06-29 21:39:28 +02:00			`->toArray();`

			`return $urls;`
			`}`
			`}`