123 lines
No EOL
4.5 KiB
PHP
123 lines
No EOL
4.5 KiB
PHP
<?php
|
|
|
|
namespace Tests\Unit\Services\Parsers;
|
|
|
|
use App\Services\Parsers\VrtHomepageParser;
|
|
use PHPUnit\Framework\TestCase;
|
|
|
|
/**
 * Unit tests for VrtHomepageParser::extractArticleUrls().
 *
 * Each test feeds a small HTML fixture to the parser and checks which article
 * URLs come back. The fixtures are inline nowdoc strings, so no network or
 * filesystem access is involved.
 */
class VrtHomepageParserTest extends TestCase
{
    /**
     * Relative English article hrefs are expected to come back as absolute
     * https://www.vrt.be URLs.
     */
    public function test_extracts_english_article_urls_from_relative_links(): void
    {
        $html = <<<'HTML'
        <a href="/vrtnws/en/2026/03/03/da-vinci-botticelli-and-cranach-shine-at-the-bozar/">
            <img src="https://images.vrt.be/example.jpg" alt="">
            <span>Culture</span>
            <h2>Da Vinci, Botticelli and Cranach shine at the Bozar</h2>
            <time>10 hours ago</time>
        </a>
        <a href="/vrtnws/en/2026/03/06/work-to-remove-7-nazi-sea-mines-to-get-underway-on-monday/">
            <img src="https://images.vrt.be/example2.jpg" alt="">
            <span>Home News</span>
            <h2>Work to remove 7 Nazi sea mines to get underway on Monday</h2>
            <time>Fri 6 Mar</time>
        </a>
        HTML;

        $urls = VrtHomepageParser::extractArticleUrls($html, 'en');

        $this->assertCount(2, $urls);
        $this->assertContains('https://www.vrt.be/vrtnws/en/2026/03/03/da-vinci-botticelli-and-cranach-shine-at-the-bozar/', $urls);
        $this->assertContains('https://www.vrt.be/vrtnws/en/2026/03/06/work-to-remove-7-nazi-sea-mines-to-get-underway-on-monday/', $urls);
    }

    /**
     * Absolute Dutch article hrefs are expected to be returned unchanged.
     */
    public function test_extracts_dutch_article_urls_from_absolute_links(): void
    {
        $html = <<<'HTML'
        <a href="https://www.vrt.be/vrtnws/nl/2026/03/07/cuba-nadert-het-einde-en-zal-snel-onderhandelen-zegt-presiden/">
            <img src="https://images.vrt.be/example.jpg">
            <span>Latijns-Amerika</span>
            <h3>Cuba nadert het einde</h3>
            <time>1 uur geleden</time>
        </a>
        <a href="https://www.vrt.be/vrtnws/nl/2026/03/07/planckendael-aap-ontsnapt/">
            <img src="https://images.vrt.be/example2.jpg">
            <span>Binnenland</span>
            <h3>Goudkopleeuwaapje even ontsnapt</h3>
            <time>49 minuten geleden</time>
        </a>
        HTML;

        $urls = VrtHomepageParser::extractArticleUrls($html, 'nl');

        $this->assertCount(2, $urls);
        $this->assertContains('https://www.vrt.be/vrtnws/nl/2026/03/07/cuba-nadert-het-einde-en-zal-snel-onderhandelen-zegt-presiden/', $urls);
        $this->assertContains('https://www.vrt.be/vrtnws/nl/2026/03/07/planckendael-aap-ontsnapt/', $urls);
    }

    /**
     * An English article link must not be returned when Dutch is requested.
     */
    public function test_does_not_extract_urls_for_wrong_language(): void
    {
        $html = <<<'HTML'
        <a href="/vrtnws/en/2026/03/03/some-english-article/">Article</a>
        HTML;

        $urls = VrtHomepageParser::extractArticleUrls($html, 'nl');

        // assertCount(0) rather than assertEmpty(): assertEmpty() would also
        // accept null, false or '' and so hide a parser that signals failure.
        $this->assertCount(0, $urls);
    }

    /**
     * Two links pointing at the same article must yield a single URL.
     */
    public function test_deduplicates_urls(): void
    {
        $html = <<<'HTML'
        <a href="/vrtnws/en/2026/03/03/same-article/">Article</a>
        <a href="/vrtnws/en/2026/03/03/same-article/">Article again</a>
        HTML;

        $urls = VrtHomepageParser::extractArticleUrls($html, 'en');

        $this->assertCount(1, $urls);
        // Pin which URL survives, not just how many entries are left.
        $this->assertContains('https://www.vrt.be/vrtnws/en/2026/03/03/same-article/', $urls);
    }

    /**
     * HTML whose only link is not an article link yields no URLs.
     */
    public function test_returns_empty_array_for_html_without_article_links(): void
    {
        $html = '<html><body><a href="/about">About</a></body></html>';

        $urls = VrtHomepageParser::extractArticleUrls($html, 'en');

        $this->assertCount(0, $urls);
    }

    /**
     * Relative and absolute links in the same document are both returned,
     * each as an absolute URL.
     */
    public function test_handles_mixed_relative_and_absolute_links(): void
    {
        $html = <<<'HTML'
        <a href="/vrtnws/nl/2026/03/07/relative-article/">Relative</a>
        <a href="https://www.vrt.be/vrtnws/nl/2026/03/07/absolute-article/">Absolute</a>
        HTML;

        $urls = VrtHomepageParser::extractArticleUrls($html, 'nl');

        $this->assertCount(2, $urls);
        $this->assertContains('https://www.vrt.be/vrtnws/nl/2026/03/07/relative-article/', $urls);
        $this->assertContains('https://www.vrt.be/vrtnws/nl/2026/03/07/absolute-article/', $urls);
    }

    /**
     * Calling the parser without a language argument is expected to return
     * only the English article from a mixed English/Dutch fixture.
     */
    public function test_defaults_to_english_when_no_language_specified(): void
    {
        $html = <<<'HTML'
        <a href="/vrtnws/en/2026/03/03/test-article/">Test</a>
        <a href="/vrtnws/nl/2026/03/03/dutch-article/">Dutch</a>
        HTML;

        $urls = VrtHomepageParser::extractArticleUrls($html);

        $this->assertCount(1, $urls);
        $this->assertContains('https://www.vrt.be/vrtnws/en/2026/03/03/test-article/', $urls);
    }

    /**
     * An empty HTML string yields no URLs.
     */
    public function test_returns_empty_array_for_empty_html(): void
    {
        $urls = VrtHomepageParser::extractArticleUrls('', 'en');

        $this->assertCount(0, $urls);
    }
}