fedi-feed-router/backend/src/Domains/Article/Parsers/Belga/BelgaHomepageParser.php
2025-08-15 18:20:19 +02:00

48 lines
No EOL
1.3 KiB
PHP

<?php
namespace Domains\Article\Parsers\Belga;
class BelgaHomepageParser
{
/**
* @return array<int, string>
*/
public static function extractArticleUrls(string $html): array
{
// Find all relative article links (most articles use relative paths)
preg_match_all('/<a[^>]+href="(\/[a-z0-9-]+)"/', $html, $matches);
// Blacklist of non-article paths
$blacklistPaths = [
'/',
'/de',
'/feed',
'/search',
'/category',
'/about',
'/contact',
'/privacy',
'/terms',
];
$urls = collect($matches[1])
->unique()
->filter(function ($path) use ($blacklistPaths) {
// Exclude exact matches and paths starting with blacklisted paths
foreach ($blacklistPaths as $blacklistedPath) {
if ($path === $blacklistedPath || str_starts_with($path, $blacklistedPath . '/')) {
return false;
}
}
return true;
})
->map(function ($path) {
// Convert relative paths to absolute URLs
return 'https://www.belganewsagency.eu' . $path;
})
->values()
->toArray();
return $urls;
}
}