';
        $title = BelgaArticlePageParser::extractTitle($html);
        $this->assertEquals('Test Article Title', $title);
    }

    // NOTE(review): several fixtures below contain no markup at all (some are
    // empty strings) even though the tests expect values parsed from specific
    // tags. The HTML appears to have been stripped from these literals; the
    // literals are left untouched here. TODO: restore the original fixtures
    // and confirm against version control.

    /** extractTitle(): title taken from the h1 fallback. */
    public function test_extract_title_from_h1_tag(): void
    {
        $html = '

H1 Title Test

';

        $title = BelgaArticlePageParser::extractTitle($html);

        $this->assertSame('H1 Title Test', $title);
    }

    /** extractTitle(): title taken from the title-tag fallback. */
    public function test_extract_title_from_title_tag(): void
    {
        $html = 'Page Title Test';

        $title = BelgaArticlePageParser::extractTitle($html);

        $this->assertSame('Page Title Test', $title);
    }

    /** extractTitle(): the returned title is expected with entities decoded. */
    public function test_extract_title_with_html_entities(): void
    {
        $html = '';

        $title = BelgaArticlePageParser::extractTitle($html);

        $this->assertSame('Test & Article "Title"', $title);
    }

    /** extractTitle(): null when the input carries no title. */
    public function test_extract_title_returns_null_when_not_found(): void
    {
        $html = '

No title here

';

        $title = BelgaArticlePageParser::extractTitle($html);

        $this->assertNull($title);
    }

    /** extractDescription(): description taken from the og meta tag. */
    public function test_extract_description_from_og_meta_tag(): void
    {
        $html = '';

        $description = BelgaArticlePageParser::extractDescription($html);

        $this->assertSame('Test article description', $description);
    }

    /** extractDescription(): description taken from the paragraph fallback. */
    public function test_extract_description_from_paragraph(): void
    {
        $html = '

This is the first paragraph description.

';

        $description = BelgaArticlePageParser::extractDescription($html);

        $this->assertSame('This is the first paragraph description.', $description);
    }

    /** extractDescription(): the returned description is expected with entities decoded. */
    public function test_extract_description_with_html_entities(): void
    {
        $html = '';

        $description = BelgaArticlePageParser::extractDescription($html);

        $this->assertSame('Description with & entities ', $description);
    }

    /** extractDescription(): null when the input carries no description. */
    public function test_extract_description_returns_null_when_not_found(): void
    {
        $html = '
No description here
';

        $description = BelgaArticlePageParser::extractDescription($html);

        $this->assertNull($description);
    }

    /** extractFullArticle(): only the Belga paragraphs are kept, joined by a blank line. */
    public function test_extract_full_article_from_belga_paragraph_class(): void
    {
        $html = '

First paragraph content.

Second paragraph content.

This should be ignored.

';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "First paragraph content.\n\nSecond paragraph content.";
        $this->assertSame($expected, $fullArticle);
    }

    /** extractFullArticle(): empty paragraphs do not appear in the result. */
    public function test_extract_full_article_filters_empty_paragraphs(): void
    {
        $html = '

Content paragraph.

Another content paragraph.

';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "Content paragraph.\n\nAnother content paragraph.";
        $this->assertSame($expected, $fullArticle);
    }

    /** extractFullArticle(): text inside nested inline tags is kept as plain text. */
    public function test_extract_full_article_handles_nested_tags(): void
    {
        $html = '

This has bold text and italic text.

This has a link inside.

';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "This has bold text and italic text.\n\nThis has a link inside.";
        $this->assertSame($expected, $fullArticle);
    }

    /** extractFullArticle(): script and style content must not leak into the result. */
    public function test_extract_full_article_removes_scripts_and_styles(): void
    {
        $html = '

Clean content.

';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $this->assertSame('Clean content.', $fullArticle);
        $this->assertStringNotContainsString('console.log', $fullArticle);
        $this->assertStringNotContainsString('alert', $fullArticle);
        $this->assertStringNotContainsString('color: red', $fullArticle);
    }

    /** extractFullArticle(): falls back to the prezly document section. */
    public function test_extract_full_article_fallback_to_prezly_document(): void
    {
        $html = '

Content from prezly section.

More prezly content.

';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "Content from prezly section.\n\nMore prezly content.";
        $this->assertSame($expected, $fullArticle);
    }

    /** extractFullArticle(): falls back to all paragraphs as a last resort. */
    public function test_extract_full_article_fallback_to_all_paragraphs(): void
    {
        $html = '

First general paragraph.

Second general paragraph.

';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "First general paragraph.\n\nSecond general paragraph.";
        $this->assertSame($expected, $fullArticle);
    }

    /** extractFullArticle(): null when there is no paragraph content. */
    public function test_extract_full_article_returns_null_when_no_content(): void
    {
        $html = '
No paragraphs here
';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $this->assertNull($fullArticle);
    }

    /** extractThumbnail(): thumbnail taken from the og:image meta tag. */
    public function test_extract_thumbnail_from_og_image(): void
    {
        $html = '';

        $thumbnail = BelgaArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/image.jpg', $thumbnail);
    }

    /** extractThumbnail(): thumbnail taken from the img-tag fallback. */
    public function test_extract_thumbnail_from_img_tag(): void
    {
        $html = 'test';

        $thumbnail = BelgaArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/article-image.png', $thumbnail);
    }

    /** extractThumbnail(): og:image wins when both sources are present. */
    public function test_extract_thumbnail_prefers_og_image(): void
    {
        $html = ' test ';

        $thumbnail = BelgaArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/og-image.jpg', $thumbnail);
    }

    /** extractThumbnail(): null when the input carries no image. */
    public function test_extract_thumbnail_returns_null_when_not_found(): void
    {
        $html = '
No images here
';

        $thumbnail = BelgaArticlePageParser::extractThumbnail($html);

        $this->assertNull($thumbnail);
    }

    /** extractData(): returns title, description, full_article and thumbnail together. */
    public function test_extract_data_returns_all_components(): void
    {
        $html = '

Full article content here.

';

        $data = BelgaArticlePageParser::extractData($html);

        $this->assertIsArray($data);
        $this->assertArrayHasKey('title', $data);
        $this->assertArrayHasKey('description', $data);
        $this->assertArrayHasKey('full_article', $data);
        $this->assertArrayHasKey('thumbnail', $data);

        $this->assertSame('Test Article', $data['title']);
        $this->assertSame('Test description', $data['description']);
        $this->assertSame('Full article content here.', $data['full_article']);
        $this->assertSame('https://example.com/image.jpg', $data['thumbnail']);
    }

    /** extractData(): every key is still present, with null values, when nothing can be parsed. */
    public function test_extract_data_handles_missing_components_gracefully(): void
    {
        $html = '
Minimal content
';

        $data = BelgaArticlePageParser::extractData($html);

        $this->assertIsArray($data);
        $this->assertArrayHasKey('title', $data);
        $this->assertArrayHasKey('description', $data);
        $this->assertArrayHasKey('full_article', $data);
        $this->assertArrayHasKey('thumbnail', $data);

        $this->assertNull($data['title']);
        $this->assertNull($data['description']);
        $this->assertNull($data['full_article']);
        $this->assertNull($data['thumbnail']);
    }

    /**
     * Test based on actual Belga HTML structure from real article
     */
    public function test_extract_full_article_with_realistic_belga_html(): void
    {
        $html = '

Around 110,000 people joined the Antwerp Pride Parade on Saturday afternoon, according to police.

The event passed without major incidents. Earlier in the day, far-right group Voorpost held a pre-approved protest.

Police say they expect no problems with crowd dispersal, as departures will be staggered.

';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $this->assertNotNull($fullArticle);
        $this->assertStringContainsString('110,000 people joined', $fullArticle);
        $this->assertStringContainsString('major incidents', $fullArticle);
        $this->assertStringContainsString('crowd dispersal', $fullArticle);

        // Should join paragraphs with double newlines
        $this->assertStringContainsString("\n\n", $fullArticle);

        // Should strip HTML tags. The previous assertions used an empty needle,
        // which every string contains, so they could never pass. Comparing
        // against strip_tags() output verifies that no tag of any kind remains.
        $this->assertSame(strip_tags($fullArticle), $fullArticle);
    }
}