fedi-feed-router/backend/tests/Unit/Services/Parsers/VrtArticlePageParserTest.php

325 lines
12 KiB
PHP
Raw Normal View History

2025-08-07 21:19:19 +02:00
<?php
namespace Tests\Unit\Services\Parsers;
use App\Services\Parsers\VrtArticlePageParser;
use Tests\TestCase;
/**
 * Unit tests for the static HTML extraction helpers on VrtArticlePageParser.
 *
 * The cases below pin the following observable behaviour:
 *  - extractTitle():       og:title meta, then <h1>, then <title>; null when none match.
 *  - extractDescription(): og:description meta, then the first <p>; null when none match.
 *  - extractFullArticle(): all non-empty <p> texts joined by a blank line; null when there are none.
 *  - extractThumbnail():   og:image meta, then the first <img src>; null when none match.
 *  - extractData():        an array with the keys title, description, full_article and thumbnail.
 *
 * Scalar expectations use assertSame() so that a wrongly typed return value
 * (for example a non-string that merely compares equal) fails the test.
 */
class VrtArticlePageParserTest extends TestCase
{
    // ---------------------------------------------------------------------
    // extractTitle()
    // ---------------------------------------------------------------------

    public function test_extract_title_returns_og_title_when_present(): void
    {
        $html = '<html><head><meta property="og:title" content="VRT News Article Title"/></head></html>';

        $result = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('VRT News Article Title', $result);
    }

    public function test_extract_title_returns_h1_when_og_title_not_present(): void
    {
        $html = '<html><body><h1>Main Article Heading</h1></body></html>';

        $result = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('Main Article Heading', $result);
    }

    public function test_extract_title_returns_title_tag_when_og_title_and_h1_not_present(): void
    {
        $html = '<html><head><title>Page Title</title></head></html>';

        $result = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('Page Title', $result);
    }

    public function test_extract_title_decodes_html_entities(): void
    {
        $html = '<html><head><meta property="og:title" content="Title with &amp; special &quot;chars&quot;"/></head></html>';

        $result = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('Title with & special "chars"', $result);
    }

    public function test_extract_title_handles_h1_content_with_attributes(): void
    {
        $html = '<html><body><h1 class="title">Simple H1 Title</h1></body></html>';

        $result = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('Simple H1 Title', $result);
    }

    public function test_extract_title_handles_h1_with_nested_tags(): void
    {
        // Markup nested inside the <h1> must be stripped, leaving only its text.
        $html = '<html><body><h1>Title with <span>nested</span> tags</h1></body></html>';

        $result = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('Title with nested tags', $result);
    }

    public function test_extract_title_returns_null_when_none_found(): void
    {
        $html = '<html><body><p>No title tags here</p></body></html>';

        $result = VrtArticlePageParser::extractTitle($html);

        $this->assertNull($result);
    }

    // ---------------------------------------------------------------------
    // extractDescription()
    // ---------------------------------------------------------------------

    public function test_extract_description_returns_og_description_when_present(): void
    {
        $html = '<html><head><meta property="og:description" content="This is the article description"/></head></html>';

        $result = VrtArticlePageParser::extractDescription($html);

        $this->assertSame('This is the article description', $result);
    }

    public function test_extract_description_returns_first_paragraph_when_og_description_not_present(): void
    {
        $html = '<html><body><p>This is the first paragraph content.</p><p>Second paragraph.</p></body></html>';

        $result = VrtArticlePageParser::extractDescription($html);

        $this->assertSame('This is the first paragraph content.', $result);
    }

    public function test_extract_description_decodes_html_entities(): void
    {
        $html = '<html><head><meta property="og:description" content="Description with &amp; entities &lt;test&gt;"/></head></html>';

        $result = VrtArticlePageParser::extractDescription($html);

        $this->assertSame('Description with & entities <test>', $result);
    }

    public function test_extract_description_strips_tags_from_paragraph(): void
    {
        $html = '<html><body><p>Paragraph with <strong>bold</strong> and <em>italic</em> text.</p></body></html>';

        $result = VrtArticlePageParser::extractDescription($html);

        $this->assertSame('Paragraph with bold and italic text.', $result);
    }

    public function test_extract_description_returns_null_when_none_found(): void
    {
        $html = '<html><body><div>No paragraphs or meta description</div></body></html>';

        $result = VrtArticlePageParser::extractDescription($html);

        $this->assertNull($result);
    }

    // ---------------------------------------------------------------------
    // extractFullArticle()
    // ---------------------------------------------------------------------

    public function test_extract_full_article_returns_all_paragraphs(): void
    {
        $html = <<<'HTML'
            <html><body>
            <p>First paragraph content.</p>
            <p>Second paragraph with more text.</p>
            <p>Third paragraph here.</p>
            </body></html>
            HTML;

        $result = VrtArticlePageParser::extractFullArticle($html);

        // Paragraphs are expected to be separated by a blank line.
        $expected = "First paragraph content.\n\nSecond paragraph with more text.\n\nThird paragraph here.";
        $this->assertSame($expected, $result);
    }

    public function test_extract_full_article_removes_script_and_style_tags(): void
    {
        $html = <<<'HTML'
            <html><body>
            <script>alert("test");</script>
            <style>body { color: red; }</style>
            <p>Actual content paragraph.</p>
            </body></html>
            HTML;

        $result = VrtArticlePageParser::extractFullArticle($html);

        $this->assertSame('Actual content paragraph.', $result);
    }

    public function test_extract_full_article_strips_tags_from_paragraphs(): void
    {
        $html = <<<'HTML'
            <html><body>
            <p>Paragraph with <strong>bold</strong> and <a href="#">link</a> tags.</p>
            </body></html>
            HTML;

        $result = VrtArticlePageParser::extractFullArticle($html);

        $this->assertSame('Paragraph with bold and link tags.', $result);
    }

    public function test_extract_full_article_filters_out_empty_paragraphs(): void
    {
        // Both the truly empty <p></p> and the whitespace-only <p> </p> must be dropped.
        $html = <<<'HTML'
            <html><body>
            <p>First paragraph.</p>
            <p></p>
            <p> </p>
            <p>Second paragraph.</p>
            </body></html>
            HTML;

        $result = VrtArticlePageParser::extractFullArticle($html);

        $this->assertSame("First paragraph.\n\nSecond paragraph.", $result);
    }

    public function test_extract_full_article_decodes_html_entities(): void
    {
        $html = <<<'HTML'
            <html><body>
            <p>Text with &amp; entities and &quot;quotes&quot;.</p>
            </body></html>
            HTML;

        $result = VrtArticlePageParser::extractFullArticle($html);

        $this->assertSame('Text with & entities and "quotes".', $result);
    }

    public function test_extract_full_article_returns_null_when_no_paragraphs(): void
    {
        $html = '<html><body><div>No paragraph tags</div></body></html>';

        $result = VrtArticlePageParser::extractFullArticle($html);

        $this->assertNull($result);
    }

    // ---------------------------------------------------------------------
    // extractThumbnail()
    // ---------------------------------------------------------------------

    public function test_extract_thumbnail_returns_og_image_when_present(): void
    {
        $html = '<html><head><meta property="og:image" content="https://example.com/image.jpg"/></head></html>';

        $result = VrtArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/image.jpg', $result);
    }

    public function test_extract_thumbnail_returns_first_img_src_when_og_image_not_present(): void
    {
        $html = '<html><body><img src="https://example.com/photo.png" alt="Photo"/></body></html>';

        $result = VrtArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/photo.png', $result);
    }

    public function test_extract_thumbnail_returns_null_when_none_found(): void
    {
        $html = '<html><body><div>No images here</div></body></html>';

        $result = VrtArticlePageParser::extractThumbnail($html);

        $this->assertNull($result);
    }

    // ---------------------------------------------------------------------
    // extractData()
    // ---------------------------------------------------------------------

    public function test_extract_data_returns_all_extracted_fields(): void
    {
        $html = <<<'HTML'
            <html>
            <head>
            <meta property="og:title" content="Article Title"/>
            <meta property="og:description" content="Article Description"/>
            <meta property="og:image" content="https://example.com/thumb.jpg"/>
            </head>
            <body>
            <p>First paragraph of article.</p>
            <p>Second paragraph of article.</p>
            </body>
            </html>
            HTML;

        $result = VrtArticlePageParser::extractData($html);

        $this->assertIsArray($result);
        $this->assertSame('Article Title', $result['title']);
        $this->assertSame('Article Description', $result['description']);
        $this->assertSame("First paragraph of article.\n\nSecond paragraph of article.", $result['full_article']);
        $this->assertSame('https://example.com/thumb.jpg', $result['thumbnail']);
    }

    public function test_extract_data_handles_missing_elements(): void
    {
        $html = '<html><body><div>Minimal content</div></body></html>';

        $result = VrtArticlePageParser::extractData($html);

        $this->assertIsArray($result);

        // Every key must still be present, carrying null, when nothing could be extracted.
        $this->assertArrayHasKey('title', $result);
        $this->assertArrayHasKey('description', $result);
        $this->assertArrayHasKey('full_article', $result);
        $this->assertArrayHasKey('thumbnail', $result);
        $this->assertNull($result['title']);
        $this->assertNull($result['description']);
        $this->assertNull($result['full_article']);
        $this->assertNull($result['thumbnail']);
    }

    public function test_extract_data_with_partial_content(): void
    {
        $html = <<<'HTML'
            <html>
            <head><title>Just Title</title></head>
            <body><p>Single paragraph</p></body>
            </html>
            HTML;

        $result = VrtArticlePageParser::extractData($html);

        // With no Open Graph tags, the single <p> serves as both description and full article.
        $this->assertSame('Just Title', $result['title']);
        $this->assertSame('Single paragraph', $result['description']);
        $this->assertSame('Single paragraph', $result['full_article']);
        $this->assertNull($result['thumbnail']);
    }

    // ---------------------------------------------------------------------
    // Source priority when several candidates are present
    // ---------------------------------------------------------------------

    public function test_extract_title_prioritizes_og_title_over_h1_and_title(): void
    {
        $html = <<<'HTML'
            <html>
            <head>
            <title>Page Title</title>
            <meta property="og:title" content="OG Title"/>
            </head>
            <body><h1>H1 Title</h1></body>
            </html>
            HTML;

        $result = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('OG Title', $result);
    }

    public function test_extract_title_prioritizes_h1_over_title_when_no_og_title(): void
    {
        $html = <<<'HTML'
            <html>
            <head><title>Page Title</title></head>
            <body><h1>H1 Title</h1></body>
            </html>
            HTML;

        $result = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('H1 Title', $result);
    }

    public function test_extract_description_prioritizes_og_description_over_paragraph(): void
    {
        $html = <<<'HTML'
            <html>
            <head><meta property="og:description" content="OG Description"/></head>
            <body><p>First paragraph content</p></body>
            </html>
            HTML;

        $result = VrtArticlePageParser::extractDescription($html);

        $this->assertSame('OG Description', $result);
    }

    public function test_extract_thumbnail_prioritizes_og_image_over_img_src(): void
    {
        $html = <<<'HTML'
            <html>
            <head><meta property="og:image" content="https://example.com/og-image.jpg"/></head>
            <body><img src="https://example.com/img-src.jpg" alt="Image"/></body>
            </html>
            HTML;

        $result = VrtArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/og-image.jpg', $result);
    }
}