77 - Fix VRT homepage parser language support

This commit is contained in:
myrmidex 2026-03-07 18:02:27 +01:00
parent 866f8d02d3
commit 3ed49cfbbe
5 changed files with 139 additions and 33 deletions

View file

@ -4,36 +4,9 @@
use App\Contracts\HomepageParserInterface;
use App\Models\Feed;
use App\Services\Parsers\VrtHomepageParserAdapter;
use App\Services\Parsers\BelgaHomepageParserAdapter;
use Exception;
class HomepageParserFactory
{
/**
* @var array<int, class-string<HomepageParserInterface>>
*/
private static array $parsers = [
VrtHomepageParserAdapter::class,
BelgaHomepageParserAdapter::class,
];
/**
* @throws Exception
*/
public static function getParser(string $url): HomepageParserInterface
{
foreach (self::$parsers as $parserClass) {
$parser = new $parserClass();
if ($parser->canParse($url)) {
return $parser;
}
}
throw new Exception("No homepage parser found for URL: {$url}");
}
public static function getParserForFeed(Feed $feed): ?HomepageParserInterface
{
if (!$feed->provider) {
@ -50,6 +23,8 @@ public static function getParserForFeed(Feed $feed): ?HomepageParserInterface
return null;
}
return new $parserClass();
$language = $feed->language?->short_code ?? 'en';
return new $parserClass($language);
}
}

View file

@ -6,6 +6,10 @@
class BelgaHomepageParserAdapter implements HomepageParserInterface
{
/**
 * Creates the adapter with a language code, stored as a private promoted property.
 *
 * NOTE(review): no use of $language is visible in this hunk; presumably the
 * parameter exists so HomepageParserFactory can build every adapter with the
 * same `new $parserClass($language)` call — confirm against the full class.
 *
 * @param string $language Language code; defaults to 'en'.
 */
public function __construct(
private string $language = 'en',
) {}
public function canParse(string $url): bool
{
return str_contains($url, 'belganewsagency.eu');

View file

@ -7,10 +7,10 @@ class VrtHomepageParser
/**
* @return array<int, string>
*/
public static function extractArticleUrls(string $html): array
public static function extractArticleUrls(string $html, string $language = 'en'): array
{
// Extract article links using regex
preg_match_all('/href="(\/vrtnws\/en\/\d{4}\/\d{2}\/\d{2}\/[^"]+)"/', $html, $matches);
$escapedLanguage = preg_quote($language, '/');
preg_match_all('/href="(?:https:\/\/www\.vrt\.be)?(\/vrtnws\/' . $escapedLanguage . '\/\d{4}\/\d{2}\/\d{2}\/[^"]+)"/', $html, $matches);
$urls = collect($matches[1])
->unique()

View file

@ -6,6 +6,10 @@
class VrtHomepageParserAdapter implements HomepageParserInterface
{
/**
 * Creates the adapter for one language edition of VRT NWS.
 *
 * The language code is interpolated into the homepage URL path
 * (https://www.vrt.be/vrtnws/{language}/) and forwarded to
 * VrtHomepageParser::extractArticleUrls() when extracting article links.
 *
 * @param string $language Language code; defaults to 'en'.
 */
public function __construct(
private string $language = 'en',
) {}
public function canParse(string $url): bool
{
return str_contains($url, 'vrt.be');
@ -13,12 +17,12 @@ public function canParse(string $url): bool
public function extractArticleUrls(string $html): array
{
return VrtHomepageParser::extractArticleUrls($html);
return VrtHomepageParser::extractArticleUrls($html, $this->language);
}
public function getHomepageUrl(): string
{
return 'https://www.vrt.be/vrtnws/en/';
return "https://www.vrt.be/vrtnws/{$this->language}/";
}
public function getSourceName(): string

View file

@ -0,0 +1,123 @@
<?php
namespace Tests\Unit\Services\Parsers;
use App\Services\Parsers\VrtHomepageParser;
use PHPUnit\Framework\TestCase;
/**
 * Unit tests for VrtHomepageParser::extractArticleUrls().
 *
 * Fixtures are nowdoc strings; the indentation of each closing marker is
 * stripped from every fixture line (PHP 7.3+ flexible heredoc/nowdoc syntax),
 * so the markup handed to the parser carries no leading whitespace.
 */
class VrtHomepageParserTest extends TestCase
{
    /**
     * Root-relative English article links are reported as absolute www.vrt.be URLs.
     */
    public function test_extracts_english_article_urls_from_relative_links(): void
    {
        $markup = <<<'HTML'
        <a href="/vrtnws/en/2026/03/03/da-vinci-botticelli-and-cranach-shine-at-the-bozar/">
        <img src="https://images.vrt.be/example.jpg" alt="">
        <span>Culture</span>
        <h2>Da Vinci, Botticelli and Cranach shine at the Bozar</h2>
        <time>10 hours ago</time>
        </a>
        <a href="/vrtnws/en/2026/03/06/work-to-remove-7-nazi-sea-mines-to-get-underway-on-monday/">
        <img src="https://images.vrt.be/example2.jpg" alt="">
        <span>Home News</span>
        <h2>Work to remove 7 Nazi sea mines to get underway on Monday</h2>
        <time>Fri 6 Mar</time>
        </a>
        HTML;

        $found = VrtHomepageParser::extractArticleUrls($markup, 'en');

        self::assertCount(2, $found);

        $expectedUrls = [
            'https://www.vrt.be/vrtnws/en/2026/03/03/da-vinci-botticelli-and-cranach-shine-at-the-bozar/',
            'https://www.vrt.be/vrtnws/en/2026/03/06/work-to-remove-7-nazi-sea-mines-to-get-underway-on-monday/',
        ];
        foreach ($expectedUrls as $expectedUrl) {
            self::assertContains($expectedUrl, $found);
        }
    }

    /**
     * Fully-qualified Dutch article links are picked up when 'nl' is requested.
     */
    public function test_extracts_dutch_article_urls_from_absolute_links(): void
    {
        $markup = <<<'HTML'
        <a href="https://www.vrt.be/vrtnws/nl/2026/03/07/cuba-nadert-het-einde-en-zal-snel-onderhandelen-zegt-presiden/">
        <img src="https://images.vrt.be/example.jpg">
        <span>Latijns-Amerika</span>
        <h3>Cuba nadert het einde</h3>
        <time>1 uur geleden</time>
        </a>
        <a href="https://www.vrt.be/vrtnws/nl/2026/03/07/planckendael-aap-ontsnapt/">
        <img src="https://images.vrt.be/example2.jpg">
        <span>Binnenland</span>
        <h3>Goudkopleeuwaapje even ontsnapt</h3>
        <time>49 minuten geleden</time>
        </a>
        HTML;

        $found = VrtHomepageParser::extractArticleUrls($markup, 'nl');

        self::assertCount(2, $found);

        $expectedUrls = [
            'https://www.vrt.be/vrtnws/nl/2026/03/07/cuba-nadert-het-einde-en-zal-snel-onderhandelen-zegt-presiden/',
            'https://www.vrt.be/vrtnws/nl/2026/03/07/planckendael-aap-ontsnapt/',
        ];
        foreach ($expectedUrls as $expectedUrl) {
            self::assertContains($expectedUrl, $found);
        }
    }

    /**
     * An English link must not be returned when Dutch articles are requested.
     */
    public function test_does_not_extract_urls_for_wrong_language(): void
    {
        $markup = <<<'HTML'
        <a href="/vrtnws/en/2026/03/03/some-english-article/">Article</a>
        HTML;

        $found = VrtHomepageParser::extractArticleUrls($markup, 'nl');

        self::assertEmpty($found);
    }

    /**
     * The same href appearing twice yields a single entry.
     */
    public function test_deduplicates_urls(): void
    {
        $markup = <<<'HTML'
        <a href="/vrtnws/en/2026/03/03/same-article/">Article</a>
        <a href="/vrtnws/en/2026/03/03/same-article/">Article again</a>
        HTML;

        $found = VrtHomepageParser::extractArticleUrls($markup, 'en');

        self::assertCount(1, $found);
    }

    /**
     * A page whose only link is not an article link produces no URLs.
     */
    public function test_returns_empty_array_for_html_without_article_links(): void
    {
        $found = VrtHomepageParser::extractArticleUrls(
            '<html><body><a href="/about">About</a></body></html>',
            'en'
        );

        self::assertEmpty($found);
    }

    /**
     * Relative and absolute links on the same page are both returned as absolute URLs.
     */
    public function test_handles_mixed_relative_and_absolute_links(): void
    {
        $markup = <<<'HTML'
        <a href="/vrtnws/nl/2026/03/07/relative-article/">Relative</a>
        <a href="https://www.vrt.be/vrtnws/nl/2026/03/07/absolute-article/">Absolute</a>
        HTML;

        $found = VrtHomepageParser::extractArticleUrls($markup, 'nl');

        self::assertCount(2, $found);

        $expectedUrls = [
            'https://www.vrt.be/vrtnws/nl/2026/03/07/relative-article/',
            'https://www.vrt.be/vrtnws/nl/2026/03/07/absolute-article/',
        ];
        foreach ($expectedUrls as $expectedUrl) {
            self::assertContains($expectedUrl, $found);
        }
    }

    /**
     * Omitting the language argument selects English links only.
     */
    public function test_defaults_to_english_when_no_language_specified(): void
    {
        $markup = <<<'HTML'
        <a href="/vrtnws/en/2026/03/03/test-article/">Test</a>
        <a href="/vrtnws/nl/2026/03/03/dutch-article/">Dutch</a>
        HTML;

        $found = VrtHomepageParser::extractArticleUrls($markup);

        self::assertCount(1, $found);
        self::assertContains('https://www.vrt.be/vrtnws/en/2026/03/03/test-article/', $found);
    }

    /**
     * An empty document produces no URLs.
     */
    public function test_returns_empty_array_for_empty_html(): void
    {
        $blankPage = '';

        $found = VrtHomepageParser::extractArticleUrls($blankPage, 'en');

        self::assertEmpty($found);
    }
}