Compare commits
1 commit
d73309cb76
...
276812811c
| Author | SHA1 | Date | |
|---|---|---|---|
| 276812811c |
5 changed files with 6 additions and 132 deletions
|
|
@ -50,8 +50,6 @@ public static function getParserForFeed(Feed $feed): ?HomepageParserInterface
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
$language = $feed->language?->short_code ?? 'en';
|
return new $parserClass();
|
||||||
|
|
||||||
return new $parserClass($language);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -6,10 +6,6 @@
|
||||||
|
|
||||||
class BelgaHomepageParserAdapter implements HomepageParserInterface
|
class BelgaHomepageParserAdapter implements HomepageParserInterface
|
||||||
{
|
{
|
||||||
public function __construct(
|
|
||||||
private string $language = 'en',
|
|
||||||
) {}
|
|
||||||
|
|
||||||
public function canParse(string $url): bool
|
public function canParse(string $url): bool
|
||||||
{
|
{
|
||||||
return str_contains($url, 'belganewsagency.eu');
|
return str_contains($url, 'belganewsagency.eu');
|
||||||
|
|
|
||||||
|
|
@ -7,10 +7,10 @@ class VrtHomepageParser
|
||||||
/**
|
/**
|
||||||
* @return array<int, string>
|
* @return array<int, string>
|
||||||
*/
|
*/
|
||||||
public static function extractArticleUrls(string $html, string $language = 'en'): array
|
public static function extractArticleUrls(string $html): array
|
||||||
{
|
{
|
||||||
$escapedLanguage = preg_quote($language, '/');
|
// Extract article links using regex
|
||||||
preg_match_all('/href="(?:https:\/\/www\.vrt\.be)?(\/vrtnws\/' . $escapedLanguage . '\/\d{4}\/\d{2}\/\d{2}\/[^"]+)"/', $html, $matches);
|
preg_match_all('/href="(\/vrtnws\/en\/\d{4}\/\d{2}\/\d{2}\/[^"]+)"/', $html, $matches);
|
||||||
|
|
||||||
$urls = collect($matches[1])
|
$urls = collect($matches[1])
|
||||||
->unique()
|
->unique()
|
||||||
|
|
|
||||||
|
|
@ -6,10 +6,6 @@
|
||||||
|
|
||||||
class VrtHomepageParserAdapter implements HomepageParserInterface
|
class VrtHomepageParserAdapter implements HomepageParserInterface
|
||||||
{
|
{
|
||||||
public function __construct(
|
|
||||||
private string $language = 'en',
|
|
||||||
) {}
|
|
||||||
|
|
||||||
public function canParse(string $url): bool
|
public function canParse(string $url): bool
|
||||||
{
|
{
|
||||||
return str_contains($url, 'vrt.be');
|
return str_contains($url, 'vrt.be');
|
||||||
|
|
@ -17,12 +13,12 @@ public function canParse(string $url): bool
|
||||||
|
|
||||||
public function extractArticleUrls(string $html): array
|
public function extractArticleUrls(string $html): array
|
||||||
{
|
{
|
||||||
return VrtHomepageParser::extractArticleUrls($html, $this->language);
|
return VrtHomepageParser::extractArticleUrls($html);
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getHomepageUrl(): string
|
public function getHomepageUrl(): string
|
||||||
{
|
{
|
||||||
return "https://www.vrt.be/vrtnws/{$this->language}/";
|
return 'https://www.vrt.be/vrtnws/en/';
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getSourceName(): string
|
public function getSourceName(): string
|
||||||
|
|
|
||||||
|
|
@ -1,116 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
namespace Tests\Unit\Services\Parsers;
|
|
||||||
|
|
||||||
use App\Services\Parsers\VrtHomepageParser;
|
|
||||||
use PHPUnit\Framework\TestCase;
|
|
||||||
|
|
||||||
class VrtHomepageParserTest extends TestCase
|
|
||||||
{
|
|
||||||
public function test_extracts_english_article_urls_from_relative_links(): void
|
|
||||||
{
|
|
||||||
$html = <<<'HTML'
|
|
||||||
<a href="/vrtnws/en/2026/03/03/da-vinci-botticelli-and-cranach-shine-at-the-bozar/">
|
|
||||||
<img src="https://images.vrt.be/example.jpg" alt="">
|
|
||||||
<span>Culture</span>
|
|
||||||
<h2>Da Vinci, Botticelli and Cranach shine at the Bozar</h2>
|
|
||||||
<time>10 hours ago</time>
|
|
||||||
</a>
|
|
||||||
<a href="/vrtnws/en/2026/03/06/work-to-remove-7-nazi-sea-mines-to-get-underway-on-monday/">
|
|
||||||
<img src="https://images.vrt.be/example2.jpg" alt="">
|
|
||||||
<span>Home News</span>
|
|
||||||
<h2>Work to remove 7 Nazi sea mines to get underway on Monday</h2>
|
|
||||||
<time>Fri 6 Mar</time>
|
|
||||||
</a>
|
|
||||||
HTML;
|
|
||||||
|
|
||||||
$urls = VrtHomepageParser::extractArticleUrls($html, 'en');
|
|
||||||
|
|
||||||
$this->assertCount(2, $urls);
|
|
||||||
$this->assertContains('https://www.vrt.be/vrtnws/en/2026/03/03/da-vinci-botticelli-and-cranach-shine-at-the-bozar/', $urls);
|
|
||||||
$this->assertContains('https://www.vrt.be/vrtnws/en/2026/03/06/work-to-remove-7-nazi-sea-mines-to-get-underway-on-monday/', $urls);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_extracts_dutch_article_urls_from_absolute_links(): void
|
|
||||||
{
|
|
||||||
$html = <<<'HTML'
|
|
||||||
<a href="https://www.vrt.be/vrtnws/nl/2026/03/07/cuba-nadert-het-einde-en-zal-snel-onderhandelen-zegt-presiden/">
|
|
||||||
<img src="https://images.vrt.be/example.jpg">
|
|
||||||
<span>Latijns-Amerika</span>
|
|
||||||
<h3>Cuba nadert het einde</h3>
|
|
||||||
<time>1 uur geleden</time>
|
|
||||||
</a>
|
|
||||||
<a href="https://www.vrt.be/vrtnws/nl/2026/03/07/planckendael-aap-ontsnapt/">
|
|
||||||
<img src="https://images.vrt.be/example2.jpg">
|
|
||||||
<span>Binnenland</span>
|
|
||||||
<h3>Goudkopleeuwaapje even ontsnapt</h3>
|
|
||||||
<time>49 minuten geleden</time>
|
|
||||||
</a>
|
|
||||||
HTML;
|
|
||||||
|
|
||||||
$urls = VrtHomepageParser::extractArticleUrls($html, 'nl');
|
|
||||||
|
|
||||||
$this->assertCount(2, $urls);
|
|
||||||
$this->assertContains('https://www.vrt.be/vrtnws/nl/2026/03/07/cuba-nadert-het-einde-en-zal-snel-onderhandelen-zegt-presiden/', $urls);
|
|
||||||
$this->assertContains('https://www.vrt.be/vrtnws/nl/2026/03/07/planckendael-aap-ontsnapt/', $urls);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_does_not_extract_urls_for_wrong_language(): void
|
|
||||||
{
|
|
||||||
$html = <<<'HTML'
|
|
||||||
<a href="/vrtnws/en/2026/03/03/some-english-article/">Article</a>
|
|
||||||
HTML;
|
|
||||||
|
|
||||||
$urls = VrtHomepageParser::extractArticleUrls($html, 'nl');
|
|
||||||
|
|
||||||
$this->assertEmpty($urls);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_deduplicates_urls(): void
|
|
||||||
{
|
|
||||||
$html = <<<'HTML'
|
|
||||||
<a href="/vrtnws/en/2026/03/03/same-article/">Article</a>
|
|
||||||
<a href="/vrtnws/en/2026/03/03/same-article/">Article again</a>
|
|
||||||
HTML;
|
|
||||||
|
|
||||||
$urls = VrtHomepageParser::extractArticleUrls($html, 'en');
|
|
||||||
|
|
||||||
$this->assertCount(1, $urls);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_returns_empty_array_for_html_without_article_links(): void
|
|
||||||
{
|
|
||||||
$html = '<html><body><a href="/about">About</a></body></html>';
|
|
||||||
|
|
||||||
$urls = VrtHomepageParser::extractArticleUrls($html, 'en');
|
|
||||||
|
|
||||||
$this->assertEmpty($urls);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_handles_mixed_relative_and_absolute_links(): void
|
|
||||||
{
|
|
||||||
$html = <<<'HTML'
|
|
||||||
<a href="/vrtnws/nl/2026/03/07/relative-article/">Relative</a>
|
|
||||||
<a href="https://www.vrt.be/vrtnws/nl/2026/03/07/absolute-article/">Absolute</a>
|
|
||||||
HTML;
|
|
||||||
|
|
||||||
$urls = VrtHomepageParser::extractArticleUrls($html, 'nl');
|
|
||||||
|
|
||||||
$this->assertCount(2, $urls);
|
|
||||||
$this->assertContains('https://www.vrt.be/vrtnws/nl/2026/03/07/relative-article/', $urls);
|
|
||||||
$this->assertContains('https://www.vrt.be/vrtnws/nl/2026/03/07/absolute-article/', $urls);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_defaults_to_english_when_no_language_specified(): void
|
|
||||||
{
|
|
||||||
$html = <<<'HTML'
|
|
||||||
<a href="/vrtnws/en/2026/03/03/test-article/">Test</a>
|
|
||||||
<a href="/vrtnws/nl/2026/03/03/dutch-article/">Dutch</a>
|
|
||||||
HTML;
|
|
||||||
|
|
||||||
$urls = VrtHomepageParser::extractArticleUrls($html);
|
|
||||||
|
|
||||||
$this->assertCount(1, $urls);
|
|
||||||
$this->assertContains('https://www.vrt.be/vrtnws/en/2026/03/03/test-article/', $urls);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Loading…
Reference in a new issue