diff --git a/app/Services/Factories/HomepageParserFactory.php b/app/Services/Factories/HomepageParserFactory.php index 7215961..547836e 100644 --- a/app/Services/Factories/HomepageParserFactory.php +++ b/app/Services/Factories/HomepageParserFactory.php @@ -4,36 +4,9 @@ use App\Contracts\HomepageParserInterface; use App\Models\Feed; -use App\Services\Parsers\VrtHomepageParserAdapter; -use App\Services\Parsers\BelgaHomepageParserAdapter; -use Exception; class HomepageParserFactory { - /** - * @var array> - */ - private static array $parsers = [ - VrtHomepageParserAdapter::class, - BelgaHomepageParserAdapter::class, - ]; - - /** - * @throws Exception - */ - public static function getParser(string $url): HomepageParserInterface - { - foreach (self::$parsers as $parserClass) { - $parser = new $parserClass(); - - if ($parser->canParse($url)) { - return $parser; - } - } - - throw new Exception("No homepage parser found for URL: {$url}"); - } - public static function getParserForFeed(Feed $feed): ?HomepageParserInterface { if (!$feed->provider) { @@ -50,6 +23,8 @@ public static function getParserForFeed(Feed $feed): ?HomepageParserInterface return null; } - return new $parserClass(); + $language = $feed->language?->short_code ?? 'en'; + + return new $parserClass($language); } } diff --git a/app/Services/Parsers/BelgaHomepageParserAdapter.php b/app/Services/Parsers/BelgaHomepageParserAdapter.php index 4613e76..f3ba438 100644 --- a/app/Services/Parsers/BelgaHomepageParserAdapter.php +++ b/app/Services/Parsers/BelgaHomepageParserAdapter.php @@ -6,6 +6,10 @@ class BelgaHomepageParserAdapter implements HomepageParserInterface { + public function __construct( + private string $language = 'en', + ) {} + public function canParse(string $url): bool { return str_contains($url, 'belganewsagency.eu'); diff --git a/app/Services/Parsers/VrtHomepageParser.php b/app/Services/Parsers/VrtHomepageParser.php index 8ca4d96..8aa5b17 100644 --- a/app/Services/Parsers/VrtHomepageParser.php +++ b/app/Services/Parsers/VrtHomepageParser.php @@ -7,10 +7,10 @@ class VrtHomepageParser /** * @return array */ - public static function extractArticleUrls(string $html): array + public static function extractArticleUrls(string $html, string $language = 'en'): array { - // Extract article links using regex - preg_match_all('/href="(\/vrtnws\/en\/\d{4}\/\d{2}\/\d{2}\/[^"]+)"/', $html, $matches); + $escapedLanguage = preg_quote($language, '/'); + preg_match_all('/href="(?:https:\/\/www\.vrt\.be)?(\/vrtnws\/' . $escapedLanguage . '\/\d{4}\/\d{2}\/\d{2}\/[^"]+)"/', $html, $matches); $urls = collect($matches[1]) ->unique() diff --git a/app/Services/Parsers/VrtHomepageParserAdapter.php b/app/Services/Parsers/VrtHomepageParserAdapter.php index 3d1db3a..557bb44 100644 --- a/app/Services/Parsers/VrtHomepageParserAdapter.php +++ b/app/Services/Parsers/VrtHomepageParserAdapter.php @@ -6,6 +6,10 @@ class VrtHomepageParserAdapter implements HomepageParserInterface { + public function __construct( + private string $language = 'en', + ) {} + public function canParse(string $url): bool { return str_contains($url, 'vrt.be'); @@ -13,12 +17,12 @@ public function canParse(string $url): bool public function extractArticleUrls(string $html): array { - return VrtHomepageParser::extractArticleUrls($html); + return VrtHomepageParser::extractArticleUrls($html, $this->language); } public function getHomepageUrl(): string { - return 'https://www.vrt.be/vrtnws/en/'; + return "https://www.vrt.be/vrtnws/{$this->language}/"; } public function getSourceName(): string diff --git a/tests/Unit/Services/Parsers/VrtHomepageParserTest.php b/tests/Unit/Services/Parsers/VrtHomepageParserTest.php new file mode 100644 index 0000000..303f86d --- /dev/null +++ b/tests/Unit/Services/Parsers/VrtHomepageParserTest.php @@ -0,0 +1,123 @@ + + + Culture +

Da Vinci, Botticelli and Cranach shine at the Bozar

+ + + + + Home News +

Work to remove 7 Nazi sea mines to get underway on Monday

+ +
+ HTML; + + $urls = VrtHomepageParser::extractArticleUrls($html, 'en'); + + $this->assertCount(2, $urls); + $this->assertContains('https://www.vrt.be/vrtnws/en/2026/03/03/da-vinci-botticelli-and-cranach-shine-at-the-bozar/', $urls); + $this->assertContains('https://www.vrt.be/vrtnws/en/2026/03/06/work-to-remove-7-nazi-sea-mines-to-get-underway-on-monday/', $urls); + } + + public function test_extracts_dutch_article_urls_from_absolute_links(): void + { + $html = <<<'HTML' + + + Latijns-Amerika +

Cuba nadert het einde

+ +
+ + + Binnenland +

Goudkopleeuwaapje even ontsnapt

+ +
+ HTML; + + $urls = VrtHomepageParser::extractArticleUrls($html, 'nl'); + + $this->assertCount(2, $urls); + $this->assertContains('https://www.vrt.be/vrtnws/nl/2026/03/07/cuba-nadert-het-einde-en-zal-snel-onderhandelen-zegt-presiden/', $urls); + $this->assertContains('https://www.vrt.be/vrtnws/nl/2026/03/07/planckendael-aap-ontsnapt/', $urls); + } + + public function test_does_not_extract_urls_for_wrong_language(): void + { + $html = <<<'HTML' + Article + HTML; + + $urls = VrtHomepageParser::extractArticleUrls($html, 'nl'); + + $this->assertEmpty($urls); + } + + public function test_deduplicates_urls(): void + { + $html = <<<'HTML' + Article + Article again + HTML; + + $urls = VrtHomepageParser::extractArticleUrls($html, 'en'); + + $this->assertCount(1, $urls); + } + + public function test_returns_empty_array_for_html_without_article_links(): void + { + $html = 'About'; + + $urls = VrtHomepageParser::extractArticleUrls($html, 'en'); + + $this->assertEmpty($urls); + } + + public function test_handles_mixed_relative_and_absolute_links(): void + { + $html = <<<'HTML' + Relative + Absolute + HTML; + + $urls = VrtHomepageParser::extractArticleUrls($html, 'nl'); + + $this->assertCount(2, $urls); + $this->assertContains('https://www.vrt.be/vrtnws/nl/2026/03/07/relative-article/', $urls); + $this->assertContains('https://www.vrt.be/vrtnws/nl/2026/03/07/absolute-article/', $urls); + } + + public function test_defaults_to_english_when_no_language_specified(): void + { + $html = <<<'HTML' + Test + Dutch + HTML; + + $urls = VrtHomepageParser::extractArticleUrls($html); + + $this->assertCount(1, $urls); + $this->assertContains('https://www.vrt.be/vrtnws/en/2026/03/03/test-article/', $urls); + } + + public function test_returns_empty_array_for_empty_html(): void + { + $urls = VrtHomepageParser::extractArticleUrls('', 'en'); + + $this->assertEmpty($urls); + } +} \ No newline at end of file