*/
    public static function extractArticleUrls(string $html): array
    {
        // Capture the href of every <a> tag whose target is a single-segment,
        // site-relative path such as "/some-article-slug". The href must be a
        // standalone attribute (preceded by whitespace), so attributes like
        // data-href are not picked up by accident.
        $matchCount = preg_match_all(
            '/<a\s(?:[^>]*\s)?href="(\/[a-z0-9-]+)"/',
            $html,
            $matches,
        );

        // preg_match_all() yields false on a PCRE failure and 0 when nothing
        // matched; in both cases there is nothing to return.
        if ($matchCount === false || $matchCount === 0) {
            return [];
        }

        // Site sections that are known not to be articles.
        $blacklistPaths = [
            '/',
            '/de',
            '/feed',
            '/search',
            '/category',
            '/about',
            '/contact',
            '/privacy',
            '/terms',
        ];

        // Prefix prepended to each relative path to build the absolute URL.
        $baseUrl = 'https://www.belganewsagency.eu';

        return collect($matches[1])
            ->unique()
            ->filter(static function (string $path) use ($blacklistPaths): bool {
                // Reject a path that equals a blacklisted entry or lives beneath one.
                // The pattern above only captures single-segment paths, so the
                // prefix test is a safeguard in case the pattern is ever widened.
                foreach ($blacklistPaths as $blacklistedPath) {
                    if ($path === $blacklistedPath || str_starts_with($path, $blacklistedPath . '/')) {
                        return false;
                    }
                }

                return true;
            })
            ->map(static fn (string $path): string => $baseUrl . $path)
            // filter() preserves the original keys; reindex so callers get a plain list.
            ->values()
            ->toArray();
    }
}