2026-03-08 11:02:46 +01:00
|
|
|
<?php
|
|
|
|
|
|
|
|
|
|
namespace Tests\Unit\Services\Parsers;
|
|
|
|
|
|
|
|
|
|
use App\Services\Parsers\GuardianArticlePageParser;
|
|
|
|
|
use Tests\TestCase;
|
|
|
|
|
|
|
|
|
|
/**
 * Unit tests for the static extraction helpers of GuardianArticlePageParser.
 *
 * Each test feeds a small, self-contained HTML fixture to one of the parser's
 * static methods (extractTitle, extractDescription, extractFullArticle,
 * extractThumbnail, extractData) and pins the value the parser is expected
 * to return for that fixture. String expectations use assertSame so that a
 * value of the wrong type cannot satisfy the assertion through loose
 * comparison.
 */
class GuardianArticlePageParserTest extends TestCase
{
    /**
     * The title is expected to be taken from the og:title meta tag.
     */
    public function test_extract_title_from_og_meta_tag(): void
    {
        $html = '<html><head><meta property="og:title" content="Guardian Article Title"/></head><body></body></html>';

        $title = GuardianArticlePageParser::extractTitle($html);

        $this->assertSame('Guardian Article Title', $title);
    }

    /**
     * With no og:title present, the text of an <h1> is expected as the title.
     */
    public function test_extract_title_from_h1_tag(): void
    {
        $html = '<html><body><h1>H1 Title Test</h1></body></html>';

        $title = GuardianArticlePageParser::extractTitle($html);

        $this->assertSame('H1 Title Test', $title);
    }

    /**
     * With neither og:title nor <h1> present, the <title> text is expected.
     */
    public function test_extract_title_from_title_tag(): void
    {
        $html = '<html><head><title>Page Title Test</title></head><body></body></html>';

        $title = GuardianArticlePageParser::extractTitle($html);

        $this->assertSame('Page Title Test', $title);
    }

    /**
     * HTML entities inside the og:title content are expected to be decoded.
     *
     * The fixture must carry the special characters as entities (&amp;, &quot;):
     * a raw double quote inside a double-quoted attribute would terminate the
     * attribute early, and the test would no longer exercise entity decoding.
     */
    public function test_extract_title_with_html_entities(): void
    {
        $html = '<html><head><meta property="og:title" content="Test &amp; Article &quot;Title&quot;"/></head></html>';

        $title = GuardianArticlePageParser::extractTitle($html);

        $this->assertSame('Test & Article "Title"', $title);
    }

    /**
     * A document without any title source is expected to yield null.
     */
    public function test_extract_title_returns_null_when_not_found(): void
    {
        $html = '<html><body><p>No title here</p></body></html>';

        $title = GuardianArticlePageParser::extractTitle($html);

        $this->assertNull($title);
    }

    /**
     * The description is expected to be taken from the og:description meta tag.
     */
    public function test_extract_description_from_og_meta_tag(): void
    {
        $html = '<html><head><meta property="og:description" content="Guardian article description"/></head></html>';

        $description = GuardianArticlePageParser::extractDescription($html);

        $this->assertSame('Guardian article description', $description);
    }

    /**
     * With no og:description present, the paragraph text is expected instead.
     */
    public function test_extract_description_from_paragraph(): void
    {
        $html = '<html><body><p>This is the first paragraph description.</p></body></html>';

        $description = GuardianArticlePageParser::extractDescription($html);

        $this->assertSame('This is the first paragraph description.', $description);
    }

    /**
     * A document with neither a meta description nor a paragraph yields null.
     */
    public function test_extract_description_returns_null_when_not_found(): void
    {
        $html = '<html><body><div>No description here</div></body></html>';

        $description = GuardianArticlePageParser::extractDescription($html);

        $this->assertNull($description);
    }

    /**
     * Paragraphs inside the article body container are expected to be joined
     * with a blank line between them.
     */
    public function test_extract_full_article_from_guardian_article_body(): void
    {
        $html = '
            <html>
                <body>
                    <div class="article-body-commercial-selector">
                        <p>First paragraph of the article.</p>
                        <p>Second paragraph of the article.</p>
                    </div>
                </body>
            </html>
        ';

        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);

        $expected = "First paragraph of the article.\n\nSecond paragraph of the article.";
        $this->assertSame($expected, $fullArticle);
    }

    /**
     * Without the article body container, all paragraphs in the document are
     * expected to be used.
     */
    public function test_extract_full_article_fallback_to_all_paragraphs(): void
    {
        $html = '
            <html>
                <body>
                    <p>First general paragraph.</p>
                    <p>Second general paragraph.</p>
                </body>
            </html>
        ';

        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);

        $expected = "First general paragraph.\n\nSecond general paragraph.";
        $this->assertSame($expected, $fullArticle);
    }

    /**
     * Empty and whitespace-only paragraphs are expected to be dropped from
     * the extracted article text.
     */
    public function test_extract_full_article_filters_empty_paragraphs(): void
    {
        $html = '
            <html>
                <body>
                    <div class="article-body-commercial-selector">
                        <p>Content paragraph.</p>
                        <p> </p>
                        <p></p>
                        <p>Another content paragraph.</p>
                    </div>
                </body>
            </html>
        ';

        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);

        $expected = "Content paragraph.\n\nAnother content paragraph.";
        $this->assertSame($expected, $fullArticle);
    }

    /**
     * Inline markup inside a paragraph (strong, em, a) is expected to be
     * stripped while its text is kept.
     */
    public function test_extract_full_article_handles_nested_tags(): void
    {
        $html = '
            <html>
                <body>
                    <div class="article-body-commercial-selector">
                        <p>This has <strong>bold text</strong> and <em>italic text</em>.</p>
                        <p>This has <a href="#">a link</a> inside.</p>
                    </div>
                </body>
            </html>
        ';

        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);

        $expected = "This has bold text and italic text.\n\nThis has a link inside.";
        $this->assertSame($expected, $fullArticle);
    }

    /**
     * Script and style contents are expected not to leak into the article text.
     */
    public function test_extract_full_article_removes_scripts_and_styles(): void
    {
        $html = '
            <html>
                <head>
                    <script>console.log("test");</script>
                    <style>.test { color: red; }</style>
                </head>
                <body>
                    <div class="article-body-commercial-selector">
                        <p>Clean content.</p>
                    </div>
                    <script>alert("bad");</script>
                </body>
            </html>
        ';

        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);

        $this->assertSame('Clean content.', $fullArticle);
        // Explicit negative checks document the intent even though the exact
        // match above already excludes any script output.
        $this->assertStringNotContainsString('console.log', $fullArticle);
        $this->assertStringNotContainsString('alert', $fullArticle);
    }

    /**
     * A document without any paragraph is expected to yield null.
     */
    public function test_extract_full_article_returns_null_when_no_content(): void
    {
        $html = '<html><body><div>No paragraphs here</div></body></html>';

        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);

        $this->assertNull($fullArticle);
    }

    /**
     * The thumbnail is expected to be taken from the og:image meta tag.
     */
    public function test_extract_thumbnail_from_og_image(): void
    {
        $html = '<html><head><meta property="og:image" content="https://i.guim.co.uk/img/test.jpg"/></head></html>';

        $thumbnail = GuardianArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://i.guim.co.uk/img/test.jpg', $thumbnail);
    }

    /**
     * With no og:image present, the src of an <img> tag is expected instead.
     */
    public function test_extract_thumbnail_from_img_tag(): void
    {
        $html = '<html><body><img src="https://i.guim.co.uk/img/article-image.png" alt="test"/></body></html>';

        $thumbnail = GuardianArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://i.guim.co.uk/img/article-image.png', $thumbnail);
    }

    /**
     * A document without any image source is expected to yield null.
     */
    public function test_extract_thumbnail_returns_null_when_not_found(): void
    {
        $html = '<html><body><div>No images here</div></body></html>';

        $thumbnail = GuardianArticlePageParser::extractThumbnail($html);

        $this->assertNull($thumbnail);
    }

    /**
     * extractData is expected to return an array with the keys title,
     * description, full_article and thumbnail, each populated from the fixture.
     */
    public function test_extract_data_returns_all_components(): void
    {
        $html = '
            <html>
                <head>
                    <meta property="og:title" content="Guardian Test Article"/>
                    <meta property="og:description" content="Test description"/>
                    <meta property="og:image" content="https://i.guim.co.uk/img/image.jpg"/>
                </head>
                <body>
                    <div class="article-body-commercial-selector">
                        <p>Full article content here.</p>
                    </div>
                </body>
            </html>
        ';

        $data = GuardianArticlePageParser::extractData($html);

        $this->assertIsArray($data);
        $this->assertArrayHasKey('title', $data);
        $this->assertArrayHasKey('description', $data);
        $this->assertArrayHasKey('full_article', $data);
        $this->assertArrayHasKey('thumbnail', $data);

        $this->assertSame('Guardian Test Article', $data['title']);
        $this->assertSame('Test description', $data['description']);
        $this->assertSame('Full article content here.', $data['full_article']);
        $this->assertSame('https://i.guim.co.uk/img/image.jpg', $data['thumbnail']);
    }

    /**
     * When the document offers none of the components, extractData is
     * expected to still return an array, with every component null.
     */
    public function test_extract_data_handles_missing_components_gracefully(): void
    {
        $html = '<html><body><div>Minimal content</div></body></html>';

        $data = GuardianArticlePageParser::extractData($html);

        $this->assertIsArray($data);
        $this->assertNull($data['title']);
        $this->assertNull($data['description']);
        $this->assertNull($data['full_article']);
        $this->assertNull($data['thumbnail']);
    }

    /**
     * Smoke test with a longer, article-like fixture: every paragraph must be
     * represented, paragraphs must be separated by a blank line, and inline
     * markup must not survive in the output.
     */
    public function test_extract_full_article_with_realistic_guardian_html(): void
    {
        $html = '
            <html>
                <body>
                    <div class="article-body-commercial-selector">
                        <p><strong>The prime minister has announced a new climate policy that aims to reduce carbon emissions by 50% by 2030.</strong></p>
                        <p>The announcement came during a press conference at Downing Street on Tuesday afternoon.</p>
                        <p>Environmental groups have cautiously welcomed the move, while industry leaders have expressed concern about the timeline.</p>
                    </div>
                </body>
            </html>
        ';

        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);

        // Guard first: the string assertions below require a non-null value.
        $this->assertNotNull($fullArticle);
        $this->assertStringContainsString('climate policy', $fullArticle);
        $this->assertStringContainsString('press conference', $fullArticle);
        $this->assertStringContainsString('Environmental groups', $fullArticle);
        $this->assertStringContainsString("\n\n", $fullArticle);
        $this->assertStringNotContainsString('<strong>', $fullArticle);
    }
}
|