fedi-feed-router/backend/tests/Unit/Services/Parsers/VrtArticlePageParserTest.php

<?php

namespace Tests\Unit\Services\Parsers;

use App\Services\Parsers\VrtArticlePageParser;
use Tests\TestCase;

/**
 * Unit tests for the static extraction helpers on VrtArticlePageParser:
 * extractTitle, extractDescription, extractFullArticle, extractThumbnail
 * and extractData.
 *
 * Every test feeds a small inline HTML fixture to one helper and checks the
 * returned value. Value checks use assertSame so that both the type and the
 * content of the result are pinned (assertEquals would also accept loosely
 * equal scalars or stringable objects).
 */
class VrtArticlePageParserTest extends TestCase
{
    /** The og:title meta tag is used as the title when it is present. */
    public function test_extract_title_returns_og_title_when_present(): void
    {
        $html = '<html><head><meta property="og:title" content="VRT News Article Title"/></head></html>';

        $title = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('VRT News Article Title', $title);
    }

    /** Without og:title, the first <h1> supplies the title. */
    public function test_extract_title_returns_h1_when_og_title_not_present(): void
    {
        $html = '<html><body><h1>Main Article Heading</h1></body></html>';

        $title = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('Main Article Heading', $title);
    }

    /** Without og:title and <h1>, the <title> tag supplies the title. */
    public function test_extract_title_returns_title_tag_when_og_title_and_h1_not_present(): void
    {
        $html = '<html><head><title>Page Title</title></head></html>';

        $title = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('Page Title', $title);
    }

    /** Entities in the og:title content are decoded to their literal characters. */
    public function test_extract_title_decodes_html_entities(): void
    {
        $html = '<html><head><meta property="og:title" content="Title with &amp; special &quot;chars&quot;"/></head></html>';

        $title = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('Title with & special "chars"', $title);
    }

    /** An <h1> carrying attributes is still recognised. */
    public function test_extract_title_handles_h1_content_with_attributes(): void
    {
        $html = '<html><body><h1 class="title">Simple H1 Title</h1></body></html>';

        $title = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('Simple H1 Title', $title);
    }

    /** Markup nested inside the <h1> is stripped, leaving only its text. */
    public function test_extract_title_handles_h1_with_nested_tags(): void
    {
        // The <span> wrapper must disappear while its text stays in place.
        $html = '<html><body><h1>Title with <span>nested</span> tags</h1></body></html>';

        $title = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('Title with nested tags', $title);
    }

    /** With no og:title, <h1> or <title>, the result is null. */
    public function test_extract_title_returns_null_when_none_found(): void
    {
        $html = '<html><body><p>No title tags here</p></body></html>';

        $title = VrtArticlePageParser::extractTitle($html);

        $this->assertNull($title);
    }

    /** The og:description meta tag is used as the description when present. */
    public function test_extract_description_returns_og_description_when_present(): void
    {
        $html = '<html><head><meta property="og:description" content="This is the article description"/></head></html>';

        $description = VrtArticlePageParser::extractDescription($html);

        $this->assertSame('This is the article description', $description);
    }

    /** Without og:description, only the first <p> is used. */
    public function test_extract_description_returns_first_paragraph_when_og_description_not_present(): void
    {
        $html = '<html><body><p>This is the first paragraph content.</p><p>Second paragraph.</p></body></html>';

        $description = VrtArticlePageParser::extractDescription($html);

        $this->assertSame('This is the first paragraph content.', $description);
    }

    /** Entities in the og:description content are decoded. */
    public function test_extract_description_decodes_html_entities(): void
    {
        $html = '<html><head><meta property="og:description" content="Description with &amp; entities &lt;test&gt;"/></head></html>';

        $description = VrtArticlePageParser::extractDescription($html);

        $this->assertSame('Description with & entities <test>', $description);
    }

    /** Inline markup inside the fallback paragraph is stripped. */
    public function test_extract_description_strips_tags_from_paragraph(): void
    {
        $html = '<html><body><p>Paragraph with <strong>bold</strong> and <em>italic</em> text.</p></body></html>';

        $description = VrtArticlePageParser::extractDescription($html);

        $this->assertSame('Paragraph with bold and italic text.', $description);
    }

    /** With neither og:description nor any <p>, the result is null. */
    public function test_extract_description_returns_null_when_none_found(): void
    {
        $html = '<html><body><div>No paragraphs or meta description</div></body></html>';

        $description = VrtArticlePageParser::extractDescription($html);

        $this->assertNull($description);
    }

    /** All paragraphs are returned in order, separated by a blank line. */
    public function test_extract_full_article_returns_all_paragraphs(): void
    {
        $html = '<html><body>
            <p>First paragraph content.</p>
            <p>Second paragraph with more text.</p>
            <p>Third paragraph here.</p>
        </body></html>';

        $article = VrtArticlePageParser::extractFullArticle($html);

        $expected = "First paragraph content.\n\nSecond paragraph with more text.\n\nThird paragraph here.";
        $this->assertSame($expected, $article);
    }

    /** <script> and <style> contents never leak into the article text. */
    public function test_extract_full_article_removes_script_and_style_tags(): void
    {
        $html = '<html><body>
            <script>alert("test");</script>
            <style>body { color: red; }</style>
            <p>Actual content paragraph.</p>
        </body></html>';

        $article = VrtArticlePageParser::extractFullArticle($html);

        $this->assertSame('Actual content paragraph.', $article);
    }

    /** Inline markup inside paragraphs is stripped from the article text. */
    public function test_extract_full_article_strips_tags_from_paragraphs(): void
    {
        $html = '<html><body>
            <p>Paragraph with <strong>bold</strong> and <a href="#">link</a> tags.</p>
        </body></html>';

        $article = VrtArticlePageParser::extractFullArticle($html);

        $this->assertSame('Paragraph with bold and link tags.', $article);
    }

    /** Empty and whitespace-only paragraphs are skipped. */
    public function test_extract_full_article_filters_out_empty_paragraphs(): void
    {
        $html = '<html><body>
            <p>First paragraph.</p>
            <p></p>
            <p>   </p>
            <p>Second paragraph.</p>
        </body></html>';

        $article = VrtArticlePageParser::extractFullArticle($html);

        $this->assertSame("First paragraph.\n\nSecond paragraph.", $article);
    }

    /** Entities inside paragraph text are decoded. */
    public function test_extract_full_article_decodes_html_entities(): void
    {
        $html = '<html><body>
            <p>Text with &amp; entities and &quot;quotes&quot;.</p>
        </body></html>';

        $article = VrtArticlePageParser::extractFullArticle($html);

        $this->assertSame('Text with & entities and "quotes".', $article);
    }

    /** A document without any <p> yields null rather than an empty string. */
    public function test_extract_full_article_returns_null_when_no_paragraphs(): void
    {
        $html = '<html><body><div>No paragraph tags</div></body></html>';

        $article = VrtArticlePageParser::extractFullArticle($html);

        $this->assertNull($article);
    }

    /** The og:image meta tag is used as the thumbnail when present. */
    public function test_extract_thumbnail_returns_og_image_when_present(): void
    {
        $html = '<html><head><meta property="og:image" content="https://example.com/image.jpg"/></head></html>';

        $thumbnail = VrtArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/image.jpg', $thumbnail);
    }

    /** Without og:image, the src of the first <img> is used. */
    public function test_extract_thumbnail_returns_first_img_src_when_og_image_not_present(): void
    {
        $html = '<html><body><img src="https://example.com/photo.png" alt="Photo"/></body></html>';

        $thumbnail = VrtArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/photo.png', $thumbnail);
    }

    /** With neither og:image nor any <img>, the result is null. */
    public function test_extract_thumbnail_returns_null_when_none_found(): void
    {
        $html = '<html><body><div>No images here</div></body></html>';

        $thumbnail = VrtArticlePageParser::extractThumbnail($html);

        $this->assertNull($thumbnail);
    }

    /** extractData bundles title, description, full_article and thumbnail. */
    public function test_extract_data_returns_all_extracted_fields(): void
    {
        $html = '<html>
            <head>
                <meta property="og:title" content="Article Title"/>
                <meta property="og:description" content="Article Description"/>
                <meta property="og:image" content="https://example.com/thumb.jpg"/>
            </head>
            <body>
                <p>First paragraph of article.</p>
                <p>Second paragraph of article.</p>
            </body>
        </html>';

        $data = VrtArticlePageParser::extractData($html);

        $this->assertIsArray($data);
        $this->assertSame('Article Title', $data['title']);
        $this->assertSame('Article Description', $data['description']);
        $this->assertSame("First paragraph of article.\n\nSecond paragraph of article.", $data['full_article']);
        $this->assertSame('https://example.com/thumb.jpg', $data['thumbnail']);
    }

    /** Every key is still present, with a null value, when nothing can be extracted. */
    public function test_extract_data_handles_missing_elements(): void
    {
        $html = '<html><body><div>Minimal content</div></body></html>';

        $data = VrtArticlePageParser::extractData($html);

        $this->assertIsArray($data);

        // Key presence is checked separately: reading a missing key would
        // also evaluate to null and mask a missing entry.
        $this->assertArrayHasKey('title', $data);
        $this->assertArrayHasKey('description', $data);
        $this->assertArrayHasKey('full_article', $data);
        $this->assertArrayHasKey('thumbnail', $data);

        $this->assertNull($data['title']);
        $this->assertNull($data['description']);
        $this->assertNull($data['full_article']);
        $this->assertNull($data['thumbnail']);
    }

    /** Fallback sources (<title>, first <p>) feed extractData when og tags are absent. */
    public function test_extract_data_with_partial_content(): void
    {
        $html = '<html>
            <head><title>Just Title</title></head>
            <body><p>Single paragraph</p></body>
        </html>';

        $data = VrtArticlePageParser::extractData($html);

        $this->assertSame('Just Title', $data['title']);
        $this->assertSame('Single paragraph', $data['description']);
        $this->assertSame('Single paragraph', $data['full_article']);
        $this->assertNull($data['thumbnail']);
    }

    /** og:title wins even when <title> and <h1> are also present. */
    public function test_extract_title_prioritizes_og_title_over_h1_and_title(): void
    {
        $html = '<html>
            <head>
                <title>Page Title</title>
                <meta property="og:title" content="OG Title"/>
            </head>
            <body><h1>H1 Title</h1></body>
        </html>';

        $title = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('OG Title', $title);
    }

    /** <h1> wins over <title> when there is no og:title. */
    public function test_extract_title_prioritizes_h1_over_title_when_no_og_title(): void
    {
        $html = '<html>
            <head><title>Page Title</title></head>
            <body><h1>H1 Title</h1></body>
        </html>';

        $title = VrtArticlePageParser::extractTitle($html);

        $this->assertSame('H1 Title', $title);
    }

    /** og:description wins over the first paragraph. */
    public function test_extract_description_prioritizes_og_description_over_paragraph(): void
    {
        $html = '<html>
            <head><meta property="og:description" content="OG Description"/></head>
            <body><p>First paragraph content</p></body>
        </html>';

        $description = VrtArticlePageParser::extractDescription($html);

        $this->assertSame('OG Description', $description);
    }

    /** og:image wins over the first <img> src. */
    public function test_extract_thumbnail_prioritizes_og_image_over_img_src(): void
    {
        $html = '<html>
            <head><meta property="og:image" content="https://example.com/og-image.jpg"/></head>
            <body><img src="https://example.com/img-src.jpg" alt="Image"/></body>
        </html>';

        $thumbnail = VrtArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/og-image.jpg', $thumbnail);
    }
}
Increase test coverage to 78 2025-08-07 21:19:19 +02:00			`<?php`

			`namespace Tests\Unit\Services\Parsers;`

			`use App\Services\Parsers\VrtArticlePageParser;`
			`use Tests\TestCase;`

			`class VrtArticlePageParserTest extends TestCase`
			`{`
			`public function test_extract_title_returns_og_title_when_present(): void`
			`{`
			`$html = '<html><head><meta property="og:title" content="VRT News Article Title"/></head></html>';`

			`$result = VrtArticlePageParser::extractTitle($html);`

			`$this->assertEquals('VRT News Article Title', $result);`
			`}`

			`public function test_extract_title_returns_h1_when_og_title_not_present(): void`
			`{`
			`$html = '<html><body><h1>Main Article Heading</h1></body></html>';`

			`$result = VrtArticlePageParser::extractTitle($html);`

			`$this->assertEquals('Main Article Heading', $result);`
			`}`

			`public function test_extract_title_returns_title_tag_when_og_title_and_h1_not_present(): void`
			`{`
			`$html = '<html><head><title>Page Title</title></head></html>';`

			`$result = VrtArticlePageParser::extractTitle($html);`

			`$this->assertEquals('Page Title', $result);`
			`}`

			`public function test_extract_title_decodes_html_entities(): void`
			`{`
			`$html = '<html><head><meta property="og:title" content="Title with &amp; special &quot;chars&quot;"/></head></html>';`

			`$result = VrtArticlePageParser::extractTitle($html);`

			`$this->assertEquals('Title with & special "chars"', $result);`
			`}`

			`public function test_extract_title_handles_h1_content_with_attributes(): void`
			`{`
			`$html = '<html><body><h1 class="title">Simple H1 Title</h1></body></html>';`

			`$result = VrtArticlePageParser::extractTitle($html);`

			`$this->assertEquals('Simple H1 Title', $result);`
			`}`

			`public function test_extract_title_handles_h1_with_nested_tags(): void`
			`{`
			`// Should extract content from h1 and strip nested tags`
			`$html = '<html><body><h1>Title with <span>nested</span> tags</h1></body></html>';`

			`$result = VrtArticlePageParser::extractTitle($html);`

			`// Should extract and strip tags to get clean text`
			`$this->assertEquals('Title with nested tags', $result);`
			`}`

			`public function test_extract_title_returns_null_when_none_found(): void`
			`{`
			`$html = '<html><body><p>No title tags here</p></body></html>';`

			`$result = VrtArticlePageParser::extractTitle($html);`

			`$this->assertNull($result);`
			`}`

			`public function test_extract_description_returns_og_description_when_present(): void`
			`{`
			`$html = '<html><head><meta property="og:description" content="This is the article description"/></head></html>';`

			`$result = VrtArticlePageParser::extractDescription($html);`

			`$this->assertEquals('This is the article description', $result);`
			`}`

			`public function test_extract_description_returns_first_paragraph_when_og_description_not_present(): void`
			`{`
			`$html = '<html><body><p>This is the first paragraph content.</p><p>Second paragraph.</p></body></html>';`

			`$result = VrtArticlePageParser::extractDescription($html);`

			`$this->assertEquals('This is the first paragraph content.', $result);`
			`}`

			`public function test_extract_description_decodes_html_entities(): void`
			`{`
			`$html = '<html><head><meta property="og:description" content="Description with &amp; entities &lt;test&gt;"/></head></html>';`

			`$result = VrtArticlePageParser::extractDescription($html);`

			`$this->assertEquals('Description with & entities <test>', $result);`
			`}`

			`public function test_extract_description_strips_tags_from_paragraph(): void`
			`{`
			`$html = '<html><body><p>Paragraph with <strong>bold</strong> and <em>italic</em> text.</p></body></html>';`

			`$result = VrtArticlePageParser::extractDescription($html);`

			`$this->assertEquals('Paragraph with bold and italic text.', $result);`
			`}`

			`public function test_extract_description_returns_null_when_none_found(): void`
			`{`
			`$html = '<html><body><div>No paragraphs or meta description</div></body></html>';`

			`$result = VrtArticlePageParser::extractDescription($html);`

			`$this->assertNull($result);`
			`}`

			`public function test_extract_full_article_returns_all_paragraphs(): void`
			`{`
			`$html = '<html><body>`
			`<p>First paragraph content.</p>`
			`<p>Second paragraph with more text.</p>`
			`<p>Third paragraph here.</p>`
			`</body></html>';`

			`$result = VrtArticlePageParser::extractFullArticle($html);`

			`$expected = "First paragraph content.\n\nSecond paragraph with more text.\n\nThird paragraph here.";`
			`$this->assertEquals($expected, $result);`
			`}`

			`public function test_extract_full_article_removes_script_and_style_tags(): void`
			`{`
			`$html = '<html><body>`
			`<script>alert("test");</script>`
			`<style>body { color: red; }</style>`
			`<p>Actual content paragraph.</p>`
			`</body></html>';`

			`$result = VrtArticlePageParser::extractFullArticle($html);`

			`$this->assertEquals('Actual content paragraph.', $result);`
			`}`

			`public function test_extract_full_article_strips_tags_from_paragraphs(): void`
			`{`
			`$html = '<html><body>`
			`<p>Paragraph with <strong>bold</strong> and <a href="#">link</a> tags.</p>`
			`</body></html>';`

			`$result = VrtArticlePageParser::extractFullArticle($html);`

			`$this->assertEquals('Paragraph with bold and link tags.', $result);`
			`}`

			`public function test_extract_full_article_filters_out_empty_paragraphs(): void`
			`{`
			`$html = '<html><body>`
			`<p>First paragraph.</p>`
			`<p></p>`
			`<p> </p>`
			`<p>Second paragraph.</p>`
			`</body></html>';`

			`$result = VrtArticlePageParser::extractFullArticle($html);`

			`$this->assertEquals("First paragraph.\n\nSecond paragraph.", $result);`
			`}`

			`public function test_extract_full_article_decodes_html_entities(): void`
			`{`
			`$html = '<html><body>`
			`<p>Text with &amp; entities and &quot;quotes&quot;.</p>`
			`</body></html>';`

			`$result = VrtArticlePageParser::extractFullArticle($html);`

			`$this->assertEquals('Text with & entities and "quotes".', $result);`
			`}`

			`public function test_extract_full_article_returns_null_when_no_paragraphs(): void`
			`{`
			`$html = '<html><body><div>No paragraph tags</div></body></html>';`

			`$result = VrtArticlePageParser::extractFullArticle($html);`

			`$this->assertNull($result);`
			`}`

			`public function test_extract_thumbnail_returns_og_image_when_present(): void`
			`{`
			`$html = '<html><head><meta property="og:image" content="https://example.com/image.jpg"/></head></html>';`

			`$result = VrtArticlePageParser::extractThumbnail($html);`

			`$this->assertEquals('https://example.com/image.jpg', $result);`
			`}`

			`public function test_extract_thumbnail_returns_first_img_src_when_og_image_not_present(): void`
			`{`
			`$html = '<html><body><img src="https://example.com/photo.png" alt="Photo"/></body></html>';`

			`$result = VrtArticlePageParser::extractThumbnail($html);`

			`$this->assertEquals('https://example.com/photo.png', $result);`
			`}`

			`public function test_extract_thumbnail_returns_null_when_none_found(): void`
			`{`
			`$html = '<html><body><div>No images here</div></body></html>';`

			`$result = VrtArticlePageParser::extractThumbnail($html);`

			`$this->assertNull($result);`
			`}`

			`public function test_extract_data_returns_all_extracted_fields(): void`
			`{`
			`$html = '<html>`
			`<head>`
			`<meta property="og:title" content="Article Title"/>`
			`<meta property="og:description" content="Article Description"/>`
			`<meta property="og:image" content="https://example.com/thumb.jpg"/>`
			`</head>`
			`<body>`
			`<p>First paragraph of article.</p>`
			`<p>Second paragraph of article.</p>`
			`</body>`
			`</html>';`

			`$result = VrtArticlePageParser::extractData($html);`

			`$this->assertIsArray($result);`
			`$this->assertEquals('Article Title', $result['title']);`
			`$this->assertEquals('Article Description', $result['description']);`
			`$this->assertEquals("First paragraph of article.\n\nSecond paragraph of article.", $result['full_article']);`
			`$this->assertEquals('https://example.com/thumb.jpg', $result['thumbnail']);`
			`}`

			`public function test_extract_data_handles_missing_elements(): void`
			`{`
			`$html = '<html><body><div>Minimal content</div></body></html>';`

			`$result = VrtArticlePageParser::extractData($html);`

			`$this->assertIsArray($result);`
			`$this->assertArrayHasKey('title', $result);`
			`$this->assertArrayHasKey('description', $result);`
			`$this->assertArrayHasKey('full_article', $result);`
			`$this->assertArrayHasKey('thumbnail', $result);`

			`$this->assertNull($result['title']);`
			`$this->assertNull($result['description']);`
			`$this->assertNull($result['full_article']);`
			`$this->assertNull($result['thumbnail']);`
			`}`

			`public function test_extract_data_with_partial_content(): void`
			`{`
			`$html = '<html>`
			`<head><title>Just Title</title></head>`
			`<body><p>Single paragraph</p></body>`
			`</html>';`

			`$result = VrtArticlePageParser::extractData($html);`

			`$this->assertEquals('Just Title', $result['title']);`
			`$this->assertEquals('Single paragraph', $result['description']);`
			`$this->assertEquals('Single paragraph', $result['full_article']);`
			`$this->assertNull($result['thumbnail']);`
			`}`

			`public function test_extract_title_prioritizes_og_title_over_h1_and_title(): void`
			`{`
			`$html = '<html>`
			`<head>`
			`<title>Page Title</title>`
			`<meta property="og:title" content="OG Title"/>`
			`</head>`
			`<body><h1>H1 Title</h1></body>`
			`</html>';`

			`$result = VrtArticlePageParser::extractTitle($html);`

			`$this->assertEquals('OG Title', $result);`
			`}`

			`public function test_extract_title_prioritizes_h1_over_title_when_no_og_title(): void`
			`{`
			`$html = '<html>`
			`<head><title>Page Title</title></head>`
			`<body><h1>H1 Title</h1></body>`
			`</html>';`

			`$result = VrtArticlePageParser::extractTitle($html);`

			`$this->assertEquals('H1 Title', $result);`
			`}`

			`public function test_extract_description_prioritizes_og_description_over_paragraph(): void`
			`{`
			`$html = '<html>`
			`<head><meta property="og:description" content="OG Description"/></head>`
			`<body><p>First paragraph content</p></body>`
			`</html>';`

			`$result = VrtArticlePageParser::extractDescription($html);`

			`$this->assertEquals('OG Description', $result);`
			`}`

			`public function test_extract_thumbnail_prioritizes_og_image_over_img_src(): void`
			`{`
			`$html = '<html>`
			`<head><meta property="og:image" content="https://example.com/og-image.jpg"/></head>`
			`<body><img src="https://example.com/img-src.jpg" alt="Image"/></body>`
			`</html>';`

			`$result = VrtArticlePageParser::extractThumbnail($html);`

			`$this->assertEquals('https://example.com/og-image.jpg', $result);`
			`}`
			`}`