'; $result = VrtArticlePageParser::extractTitle($html); $this->assertEquals('VRT News Article Title', $result); }

    /**
     * Title fallback: with no og:title available, the h1 text is expected.
     *
     * NOTE(review): throughout this class the fixture strings, as they appear
     * here, hold plain text without any HTML tags or meta elements, and several
     * are empty strings. Presumably the markup was lost before this file reached
     * review — confirm against the original fixtures before trusting the results.
     */
    public function test_extract_title_returns_h1_when_og_title_not_present(): void
    {
        $markup = '

Main Article Heading

';

        $this->assertEquals('Main Article Heading', VrtArticlePageParser::extractTitle($markup));
    }

    /**
     * Title fallback: with neither og:title nor h1, the title tag text is expected.
     */
    public function test_extract_title_returns_title_tag_when_og_title_and_h1_not_present(): void
    {
        $markup = 'Page Title';

        $this->assertEquals('Page Title', VrtArticlePageParser::extractTitle($markup));
    }

    /**
     * The extracted title is expected with its HTML entities already decoded.
     *
     * NOTE(review): the fixture is an empty string as written — confirm.
     */
    public function test_extract_title_decodes_html_entities(): void
    {
        $markup = '';

        $this->assertEquals('Title with & special "chars"', VrtArticlePageParser::extractTitle($markup));
    }

    /**
     * An h1 that carries attributes should still yield its text content.
     */
    public function test_extract_title_handles_h1_content_with_attributes(): void
    {
        $markup = '

Simple H1 Title

';

        $this->assertEquals('Simple H1 Title', VrtArticlePageParser::extractTitle($markup));
    }

    /**
     * Tags nested inside the h1 should be stripped, leaving only the text.
     */
    public function test_extract_title_handles_h1_with_nested_tags(): void
    {
        // The h1 content is meant to be pulled out with its inner tags removed.
        $markup = '

Title with nested tags

';

        // Only the clean text should remain once tags are stripped.
        $this->assertEquals('Title with nested tags', VrtArticlePageParser::extractTitle($markup));
    }

    /**
     * Without any title source, extractTitle() is expected to return null.
     */
    public function test_extract_title_returns_null_when_none_found(): void
    {
        $markup = '

No title tags here

';

        $this->assertNull(VrtArticlePageParser::extractTitle($markup));
    }

    /**
     * The og:description value is expected when it is present.
     *
     * NOTE(review): the fixture is an empty string as written — confirm.
     */
    public function test_extract_description_returns_og_description_when_present(): void
    {
        $markup = '';

        $this->assertEquals('This is the article description', VrtArticlePageParser::extractDescription($markup));
    }

    /**
     * Description fallback: without og:description, the first paragraph is expected.
     */
    public function test_extract_description_returns_first_paragraph_when_og_description_not_present(): void
    {
        $markup = '

This is the first paragraph content.

Second paragraph.

';

        $this->assertEquals(
            'This is the first paragraph content.',
            VrtArticlePageParser::extractDescription($markup)
        );
    }

    /**
     * The extracted description is expected with its HTML entities decoded.
     *
     * NOTE(review): the fixture is an empty string as written, and the expected
     * value ends in a trailing space — both look truncated; confirm.
     */
    public function test_extract_description_decodes_html_entities(): void
    {
        $markup = '';

        $this->assertEquals('Description with & entities ', VrtArticlePageParser::extractDescription($markup));
    }

    /**
     * Inline tags inside the paragraph should not appear in the description.
     */
    public function test_extract_description_strips_tags_from_paragraph(): void
    {
        $markup = '

Paragraph with bold and italic text.

';

        $this->assertEquals(
            'Paragraph with bold and italic text.',
            VrtArticlePageParser::extractDescription($markup)
        );
    }

    /**
     * Without any description source, extractDescription() is expected to return null.
     */
    public function test_extract_description_returns_null_when_none_found(): void
    {
        $markup = '
No paragraphs or meta description
';

        $this->assertNull(VrtArticlePageParser::extractDescription($markup));
    }

    /**
     * All paragraphs are expected, joined by a blank line between them.
     */
    public function test_extract_full_article_returns_all_paragraphs(): void
    {
        $markup = '

First paragraph content.

Second paragraph with more text.

Third paragraph here.

';

        $this->assertEquals(
            "First paragraph content.\n\nSecond paragraph with more text.\n\nThird paragraph here.",
            VrtArticlePageParser::extractFullArticle($markup)
        );
    }

    /**
     * Script and style content should be absent from the extracted article.
     */
    public function test_extract_full_article_removes_script_and_style_tags(): void
    {
        $markup = '

Actual content paragraph.

';

        $this->assertEquals('Actual content paragraph.', VrtArticlePageParser::extractFullArticle($markup));
    }

    /**
     * Inline tags inside paragraphs should be stripped from the article text.
     */
    public function test_extract_full_article_strips_tags_from_paragraphs(): void
    {
        $markup = '

Paragraph with bold and link tags.

';

        $this->assertEquals(
            'Paragraph with bold and link tags.',
            VrtArticlePageParser::extractFullArticle($markup)
        );
    }

    /**
     * Empty paragraphs should be left out of the joined article text.
     */
    public function test_extract_full_article_filters_out_empty_paragraphs(): void
    {
        $markup = '

First paragraph.

Second paragraph.

';

        $this->assertEquals(
            "First paragraph.\n\nSecond paragraph.",
            VrtArticlePageParser::extractFullArticle($markup)
        );
    }

    /**
     * The extracted article text is expected with its HTML entities decoded.
     */
    public function test_extract_full_article_decodes_html_entities(): void
    {
        $markup = '

Text with & entities and "quotes".

';

        $this->assertEquals(
            'Text with & entities and "quotes".',
            VrtArticlePageParser::extractFullArticle($markup)
        );
    }

    /**
     * Without any paragraphs, extractFullArticle() is expected to return null.
     */
    public function test_extract_full_article_returns_null_when_no_paragraphs(): void
    {
        $markup = '
No paragraph tags
';

        $this->assertNull(VrtArticlePageParser::extractFullArticle($markup));
    }

    /**
     * The og:image URL is expected when it is present.
     *
     * NOTE(review): the fixture is an empty string as written — confirm.
     */
    public function test_extract_thumbnail_returns_og_image_when_present(): void
    {
        $markup = '';

        $this->assertEquals('https://example.com/image.jpg', VrtArticlePageParser::extractThumbnail($markup));
    }

    /**
     * Thumbnail fallback: without og:image, the first img src is expected.
     */
    public function test_extract_thumbnail_returns_first_img_src_when_og_image_not_present(): void
    {
        $markup = 'Photo';

        $this->assertEquals('https://example.com/photo.png', VrtArticlePageParser::extractThumbnail($markup));
    }

    /**
     * Without any image source, extractThumbnail() is expected to return null.
     */
    public function test_extract_thumbnail_returns_null_when_none_found(): void
    {
        $markup = '
No images here
';

        $this->assertNull(VrtArticlePageParser::extractThumbnail($markup));
    }

    /**
     * extractData() should return an array carrying title, description,
     * full_article and thumbnail.
     */
    public function test_extract_data_returns_all_extracted_fields(): void
    {
        $markup = '

First paragraph of article.

Second paragraph of article.

';

        $data = VrtArticlePageParser::extractData($markup);

        $this->assertIsArray($data);
        $this->assertEquals('Article Title', $data['title']);
        $this->assertEquals('Article Description', $data['description']);
        $this->assertEquals("First paragraph of article.\n\nSecond paragraph of article.", $data['full_article']);
        $this->assertEquals('https://example.com/thumb.jpg', $data['thumbnail']);
    }

    /**
     * With nothing to extract, all four keys must still exist and hold null.
     */
    public function test_extract_data_handles_missing_elements(): void
    {
        $markup = '
Minimal content
';

        $data = VrtArticlePageParser::extractData($markup);

        $this->assertIsArray($data);

        // Key presence is asserted first, for every key, before any value check.
        $this->assertArrayHasKey('title', $data);
        $this->assertArrayHasKey('description', $data);
        $this->assertArrayHasKey('full_article', $data);
        $this->assertArrayHasKey('thumbnail', $data);

        $this->assertNull($data['title']);
        $this->assertNull($data['description']);
        $this->assertNull($data['full_article']);
        $this->assertNull($data['thumbnail']);
    }

    /**
     * Partial content: title and a single paragraph are expected, with the
     * paragraph serving as both description and full article; thumbnail is null.
     */
    public function test_extract_data_with_partial_content(): void
    {
        $markup = ' Just Title

Single paragraph

';

        $data = VrtArticlePageParser::extractData($markup);

        $this->assertEquals('Just Title', $data['title']);
        $this->assertEquals('Single paragraph', $data['description']);
        $this->assertEquals('Single paragraph', $data['full_article']);
        $this->assertNull($data['thumbnail']);
    }

    /**
     * Priority check: og:title is expected to win over both h1 and the title tag.
     *
     * NOTE(review): 'OG Title' does not occur anywhere in the fixture as written — confirm.
     */
    public function test_extract_title_prioritizes_og_title_over_h1_and_title(): void
    {
        $markup = ' Page Title

H1 Title

';

        $this->assertEquals('OG Title', VrtArticlePageParser::extractTitle($markup));
    }

    /**
     * Priority check: without og:title, h1 is expected to win over the title tag.
     */
    public function test_extract_title_prioritizes_h1_over_title_when_no_og_title(): void
    {
        $markup = ' Page Title

H1 Title

';

        $this->assertEquals('H1 Title', VrtArticlePageParser::extractTitle($markup));
    }

    /**
     * Priority check: og:description is expected to win over the first paragraph.
     *
     * NOTE(review): 'OG Description' does not occur anywhere in the fixture as written — confirm.
     */
    public function test_extract_description_prioritizes_og_description_over_paragraph(): void
    {
        $markup = '

First paragraph content

';

        $this->assertEquals('OG Description', VrtArticlePageParser::extractDescription($markup));
    }

    /**
     * Priority check: og:image is expected to win over the first img src.
     *
     * NOTE(review): the expected URL does not occur anywhere in the fixture as written — confirm.
     */
    public function test_extract_thumbnail_prioritizes_og_image_over_img_src(): void
    {
        $markup = ' Image ';

        $this->assertEquals('https://example.com/og-image.jpg', VrtArticlePageParser::extractThumbnail($markup));
    }
}