fedi-feed-router/backend/src/Domains/Article/Parsers/Belga/BelgaHomepageParser.php

48 lines
1.3 KiB
PHP
Raw Normal View History

2025-06-29 21:39:28 +02:00
<?php
2025-08-15 16:39:18 +02:00
namespace Domains\Article\Parsers\Belga;
2025-06-29 21:39:28 +02:00
/**
 * Extracts article URLs from the HTML of the Belga News Agency homepage.
 *
 * Only root-relative, single-segment links (for example "/some-article-slug")
 * are considered; known navigation and utility paths are filtered out.
 */
class BelgaHomepageParser
{
    /**
     * Origin prepended to every relative article path.
     */
    private const BASE_URL = 'https://www.belganewsagency.eu';

    /**
     * Root-relative paths that are not articles.
     *
     * @var list<string>
     */
    private const BLACKLISTED_PATHS = [
        '/',
        '/de',
        '/feed',
        '/search',
        '/category',
        '/about',
        '/contact',
        '/privacy',
        '/terms',
    ];

    /**
     * Captures the double-quoted, single-segment, root-relative href of an anchor tag.
     *
     * The whitespace required after "<a" keeps tags such as <area> or <article>
     * from matching, and the whitespace required directly before "href" keeps
     * look-alike attributes such as data-href from being captured.
     */
    private const ARTICLE_LINK_PATTERN = '/<a\s(?:[^>]*\s)?href="(\/[a-z0-9-]+)"/';

    /**
     * Collect the absolute URLs of all article links found in the given HTML.
     *
     * Duplicates are removed (first occurrence wins, document order is kept),
     * blacklisted paths are dropped, and the remaining relative paths are
     * turned into absolute URLs.
     *
     * @param string $html Raw homepage markup.
     *
     * @return array<int, string> Zero-indexed list of absolute article URLs.
     */
    public static function extractArticleUrls(string $html): array
    {
        preg_match_all(self::ARTICLE_LINK_PATTERN, $html, $matches);

        // Defensive: fall back to an empty list if the capture group is missing.
        $paths = array_unique($matches[1] ?? []);

        $articlePaths = array_filter(
            $paths,
            static fn (string $path): bool => !self::isBlacklisted($path),
        );

        // array_filter() preserves keys, so reindex to return a proper list.
        return array_values(array_map(
            static fn (string $path): string => self::BASE_URL . $path,
            $articlePaths,
        ));
    }

    /**
     * Tell whether a path equals a blacklisted path or lies beneath one.
     *
     * With the current link pattern only single-segment paths are captured, so
     * in practice only the exact-match branch can trigger; the prefix check is
     * kept so the blacklist stays correct if the pattern is ever widened.
     */
    private static function isBlacklisted(string $path): bool
    {
        foreach (self::BLACKLISTED_PATHS as $blacklistedPath) {
            if ($path === $blacklistedPath || str_starts_with($path, $blacklistedPath . '/')) {
                return true;
            }
        }

        return false;
    }
}