fedi-feed-router/backend/src/Domains/Article/Parsers/Belga/BelgaHomepageParser.php

<?php

namespace Domains\Article\Parsers\Belga;

/**
 * Extracts article URLs from the HTML of the Belga News Agency homepage.
 *
 * The parser is a pure string transformation: it only inspects the markup it
 * is given and performs no I/O.
 */
class BelgaHomepageParser
{
    /** Origin prepended to each relative path to build an absolute URL. */
    private const BASE_URL = 'https://www.belganewsagency.eu';

    /**
     * Paths that are never treated as articles. A path is rejected when it
     * equals one of these entries or lives underneath one of them.
     *
     * @var array<int, string>
     */
    private const NON_ARTICLE_PATHS = [
        '/',
        '/de',
        '/feed',
        '/search',
        '/category',
        '/about',
        '/contact',
        '/privacy',
        '/terms',
    ];

    /**
     * Collects the absolute URLs of all article links found in the given HTML.
     *
     * Only root-relative, single-segment links made of lowercase letters,
     * digits and hyphens (e.g. "/some-article-slug") are considered. Duplicates
     * are removed, keeping the position of the first occurrence, and paths
     * listed in NON_ARTICLE_PATHS are dropped.
     *
     * @param string $html Raw HTML to scan.
     *
     * @return array<int, string> Zero-indexed list of absolute article URLs, in
     *                            document order; empty when nothing matches.
     */
    public static function extractArticleUrls(string $html): array
    {
        // The lookahead after "<a" requires whitespace so that other tags whose
        // name merely starts with "a" (<area>, <article>, ...) are ignored, and
        // the "\s" before "href" keeps attributes such as data-href from being
        // mistaken for the link target. Both quote styles are accepted; the
        // back-reference makes sure the closing quote matches the opening one.
        $matchCount = preg_match_all(
            '/<a(?=\s)[^>]*?\shref=(["\'])(\/[a-z0-9-]+)\1/',
            $html,
            $matches
        );

        // false signals a PCRE failure, 0 means no links at all; in both cases
        // there is nothing to return.
        if ($matchCount === false || $matchCount === 0) {
            return [];
        }

        $urls = [];

        // array_unique() keeps the first occurrence of each path, so the
        // result preserves document order.
        foreach (array_unique($matches[2]) as $path) {
            if (self::isNonArticlePath($path)) {
                continue;
            }

            $urls[] = self::BASE_URL . $path;
        }

        return $urls;
    }

    /**
     * Tells whether a path equals, or is nested under, a blacklisted path.
     */
    private static function isNonArticlePath(string $path): bool
    {
        foreach (self::NON_ARTICLE_PATHS as $blacklistedPath) {
            if ($path === $blacklistedPath || str_starts_with($path, $blacklistedPath . '/')) {
                return true;
            }
        }

        return false;
    }
}