'; $title = GuardianArticlePageParser::extractTitle($html); $this->assertEquals('Guardian Article Title', $title); }

    // NOTE(review): several fixtures below contain no markup at all (some are
    // empty strings) even though the test names refer to tags and meta
    // elements. The HTML was presumably stripped somewhere upstream — confirm
    // against the original file before relying on these tests.

    /**
     * extractTitle() is expected to return 'H1 Title Test' for this fixture.
     */
    public function test_extract_title_from_h1_tag(): void
    {
        $markup = '

H1 Title Test

';

        $this->assertEquals('H1 Title Test', GuardianArticlePageParser::extractTitle($markup));
    }

    /**
     * extractTitle() is expected to return 'Page Title Test' for this fixture.
     */
    public function test_extract_title_from_title_tag(): void
    {
        $markup = 'Page Title Test';

        $this->assertEquals('Page Title Test', GuardianArticlePageParser::extractTitle($markup));
    }

    /**
     * extractTitle() is expected to return the title with literal '&' and
     * double-quote characters.
     */
    public function test_extract_title_with_html_entities(): void
    {
        $markup = '';

        $this->assertEquals('Test & Article "Title"', GuardianArticlePageParser::extractTitle($markup));
    }

    /**
     * extractTitle() is expected to return null for this fixture.
     */
    public function test_extract_title_returns_null_when_not_found(): void
    {
        $markup = '

No title here

';

        $this->assertNull(GuardianArticlePageParser::extractTitle($markup));
    }

    /**
     * extractDescription() is expected to return 'Guardian article description'.
     */
    public function test_extract_description_from_og_meta_tag(): void
    {
        $markup = '';

        $this->assertEquals(
            'Guardian article description',
            GuardianArticlePageParser::extractDescription($markup)
        );
    }

    /**
     * extractDescription() is expected to return the paragraph text of this fixture.
     */
    public function test_extract_description_from_paragraph(): void
    {
        $markup = '

This is the first paragraph description.

';

        $this->assertEquals(
            'This is the first paragraph description.',
            GuardianArticlePageParser::extractDescription($markup)
        );
    }

    /**
     * extractDescription() is expected to return null for this fixture.
     */
    public function test_extract_description_returns_null_when_not_found(): void
    {
        $markup = '
No description here
';

        $this->assertNull(GuardianArticlePageParser::extractDescription($markup));
    }

    /**
     * extractFullArticle() is expected to return both paragraphs joined by a
     * blank line.
     */
    public function test_extract_full_article_from_guardian_article_body(): void
    {
        $markup = '

First paragraph of the article.

Second paragraph of the article.

';
        $expectedText = "First paragraph of the article.\n\nSecond paragraph of the article.";

        $this->assertEquals($expectedText, GuardianArticlePageParser::extractFullArticle($markup));
    }

    /**
     * extractFullArticle() is expected to return both general paragraphs joined
     * by a blank line.
     */
    public function test_extract_full_article_fallback_to_all_paragraphs(): void
    {
        $markup = '

First general paragraph.

Second general paragraph.

';
        $expectedText = "First general paragraph.\n\nSecond general paragraph.";

        $this->assertEquals($expectedText, GuardianArticlePageParser::extractFullArticle($markup));
    }

    /**
     * extractFullArticle() is expected to return only the two content
     * paragraphs, joined by a blank line.
     */
    public function test_extract_full_article_filters_empty_paragraphs(): void
    {
        $markup = '

Content paragraph.

Another content paragraph.

';
        $expectedText = "Content paragraph.\n\nAnother content paragraph.";

        $this->assertEquals($expectedText, GuardianArticlePageParser::extractFullArticle($markup));
    }

    /**
     * extractFullArticle() is expected to return plain paragraph text for
     * this fixture.
     */
    public function test_extract_full_article_handles_nested_tags(): void
    {
        $markup = '

This has bold text and italic text.

This has a link inside.

';
        $expectedText = "This has bold text and italic text.\n\nThis has a link inside.";

        $this->assertEquals($expectedText, GuardianArticlePageParser::extractFullArticle($markup));
    }

    /**
     * extractFullArticle() is expected to return exactly 'Clean content.',
     * with no 'console.log' or 'alert' in the result.
     */
    public function test_extract_full_article_removes_scripts_and_styles(): void
    {
        $markup = '

Clean content.

';

        $articleText = GuardianArticlePageParser::extractFullArticle($markup);

        $this->assertEquals('Clean content.', $articleText);
        foreach (['console.log', 'alert'] as $forbiddenFragment) {
            $this->assertStringNotContainsString($forbiddenFragment, $articleText);
        }
    }

    /**
     * extractFullArticle() is expected to return null for this fixture.
     */
    public function test_extract_full_article_returns_null_when_no_content(): void
    {
        $markup = '
No paragraphs here
';

        $this->assertNull(GuardianArticlePageParser::extractFullArticle($markup));
    }

    /**
     * extractThumbnail() is expected to return the test.jpg image URL.
     */
    public function test_extract_thumbnail_from_og_image(): void
    {
        $markup = '';

        $this->assertEquals(
            'https://i.guim.co.uk/img/test.jpg',
            GuardianArticlePageParser::extractThumbnail($markup)
        );
    }

    /**
     * extractThumbnail() is expected to return the article-image.png URL.
     */
    public function test_extract_thumbnail_from_img_tag(): void
    {
        $markup = 'test';

        $this->assertEquals(
            'https://i.guim.co.uk/img/article-image.png',
            GuardianArticlePageParser::extractThumbnail($markup)
        );
    }

    /**
     * extractThumbnail() is expected to return null for this fixture.
     */
    public function test_extract_thumbnail_returns_null_when_not_found(): void
    {
        $markup = '
No images here
';

        $this->assertNull(GuardianArticlePageParser::extractThumbnail($markup));
    }

    /**
     * extractData() is expected to return an array with the keys 'title',
     * 'description', 'full_article' and 'thumbnail', each holding the value
     * asserted below.
     */
    public function test_extract_data_returns_all_components(): void
    {
        $markup = '

Full article content here.

';

        $parsed = GuardianArticlePageParser::extractData($markup);

        $this->assertIsArray($parsed);
        foreach (['title', 'description', 'full_article', 'thumbnail'] as $field) {
            $this->assertArrayHasKey($field, $parsed);
        }
        $this->assertEquals('Guardian Test Article', $parsed['title']);
        $this->assertEquals('Test description', $parsed['description']);
        $this->assertEquals('Full article content here.', $parsed['full_article']);
        $this->assertEquals('https://i.guim.co.uk/img/image.jpg', $parsed['thumbnail']);
    }

    /**
     * extractData() is expected to return an array whose four components are
     * all null for this fixture.
     */
    public function test_extract_data_handles_missing_components_gracefully(): void
    {
        $markup = '
Minimal content
';

        $parsed = GuardianArticlePageParser::extractData($markup);

        $this->assertIsArray($parsed);
        foreach (['title', 'description', 'full_article', 'thumbnail'] as $field) {
            $this->assertNull($parsed[$field]);
        }
    }

    /**
     * extractFullArticle() is expected to return non-null text that contains
     * a phrase from each of the three paragraphs plus a blank-line separator.
     */
    public function test_extract_full_article_with_realistic_guardian_html(): void
    {
        $markup = '

The prime minister has announced a new climate policy that aims to reduce carbon emissions by 50% by 2030.

The announcement came during a press conference at Downing Street on Tuesday afternoon.

Environmental groups have cautiously welcomed the move, while industry leaders have expressed concern about the timeline.

';

        $articleText = GuardianArticlePageParser::extractFullArticle($markup);

        $this->assertNotNull($articleText);
        foreach (['climate policy', 'press conference', 'Environmental groups', "\n\n"] as $expectedFragment) {
            $this->assertStringContainsString($expectedFragment, $articleText);
        }
        // NOTE(review): the needle is an empty string in this view; it was
        // presumably a markup fragment that got stripped. Confirm the intended
        // value — an empty needle likely cannot satisfy a not-contains check.
        $this->assertStringNotContainsString('', $articleText);
    }
}