2025-06-29 21:39:28 +02:00
|
|
|
<?php
|
|
|
|
|
|
2025-08-15 16:39:18 +02:00
|
|
|
namespace Domains\Article\Parsers\Belga;
|
2025-06-29 21:39:28 +02:00
|
|
|
|
|
|
|
|
class BelgaHomepageParser
{
    /**
     * Origin prepended to every relative article path found on the homepage.
     */
    private const BASE_URL = 'https://www.belganewsagency.eu';

    /**
     * Paths that must not be returned as articles.
     *
     * A candidate is rejected when it equals one of these entries or when it
     * starts with an entry followed by a slash.
     */
    private const NON_ARTICLE_PATHS = [
        '/',
        '/de',
        '/feed',
        '/search',
        '/category',
        '/about',
        '/contact',
        '/privacy',
        '/terms',
    ];

    /**
     * Extract absolute article URLs from homepage HTML.
     *
     * Only anchors whose double-quoted href is a single-segment relative
     * slug made of lowercase letters, digits and dashes (e.g. "/some-title")
     * are considered. Duplicates are dropped, keeping the first occurrence,
     * blacklisted paths are removed, and the remaining paths are prefixed
     * with the site origin.
     *
     * @param string $html Raw HTML to scan.
     *
     * @return array<int, string> Zero-indexed list of absolute URLs, in document order.
     */
    public static function extractArticleUrls(string $html): array
    {
        // Require whitespace after "<a" and directly before "href" so that
        // other tags starting with "a" (<area>, <article>, ...) and
        // attributes merely ending in "href" (data-href) are not mistaken
        // for anchor links.
        $matchCount = preg_match_all(
            '/<a\s(?:[^>]*\s)?href="(\/[a-z0-9-]+)"/',
            $html,
            $matches
        );

        // preg_match_all() returns false on a PCRE failure and 0 when nothing
        // matched; in both cases there is nothing to return.
        if ($matchCount === false || $matchCount === 0) {
            return [];
        }

        $isArticlePath = static function (string $path): bool {
            foreach (self::NON_ARTICLE_PATHS as $excludedPath) {
                if ($path === $excludedPath || str_starts_with($path, $excludedPath . '/')) {
                    return false;
                }
            }

            return true;
        };

        $articlePaths = array_filter(array_unique($matches[1] ?? []), $isArticlePath);

        // array_unique()/array_filter() keep the original keys, so reindex
        // to hand back a plain list.
        return array_values(array_map(
            static fn (string $path): string => self::BASE_URL . $path,
            $articlePaths
        ));
    }
}
|