fedi-feed-router/backend/tests/Unit/Services/Parsers/BelgaArticlePageParserTest.php

<?php

namespace Tests\Unit\Services\Parsers;

use App\Services\Parsers\BelgaArticlePageParser;
use Tests\TestCase;

/**
 * Unit tests for the static extraction helpers on BelgaArticlePageParser.
 *
 * Each test feeds a small, hand-written HTML fixture to one of the parser's
 * static methods (extractTitle, extractDescription, extractFullArticle,
 * extractThumbnail, extractData) and checks the returned value.
 *
 * Exact-value checks use assertSame rather than assertEquals so that the
 * returned value must be a string identical to the expectation; assertEquals
 * compares scalars loosely and would also accept non-string values that
 * merely cast to the same text.
 */
class BelgaArticlePageParserTest extends TestCase
{
    /**
     * An og:title meta tag supplies the title.
     */
    public function test_extract_title_from_og_meta_tag(): void
    {
        $html = '<html><head><meta property="og:title" content="Test Article Title"/></head><body></body></html>';

        $title = BelgaArticlePageParser::extractTitle($html);

        $this->assertSame('Test Article Title', $title);
    }

    /**
     * With no og:title present, the text of an <h1> is returned.
     */
    public function test_extract_title_from_h1_tag(): void
    {
        $html = '<html><body><h1>H1 Title Test</h1></body></html>';

        $title = BelgaArticlePageParser::extractTitle($html);

        $this->assertSame('H1 Title Test', $title);
    }

    /**
     * With neither og:title nor <h1> present, the <title> text is returned.
     */
    public function test_extract_title_from_title_tag(): void
    {
        $html = '<html><head><title>Page Title Test</title></head><body></body></html>';

        $title = BelgaArticlePageParser::extractTitle($html);

        $this->assertSame('Page Title Test', $title);
    }

    /**
     * HTML entities inside the og:title content are decoded in the result.
     */
    public function test_extract_title_with_html_entities(): void
    {
        $html = '<html><head><meta property="og:title" content="Test &amp; Article &quot;Title&quot;"/></head></html>';

        $title = BelgaArticlePageParser::extractTitle($html);

        $this->assertSame('Test & Article "Title"', $title);
    }

    /**
     * A document without og:title, <h1> or <title> yields null.
     */
    public function test_extract_title_returns_null_when_not_found(): void
    {
        $html = '<html><body><p>No title here</p></body></html>';

        $title = BelgaArticlePageParser::extractTitle($html);

        $this->assertNull($title);
    }

    /**
     * An og:description meta tag supplies the description.
     */
    public function test_extract_description_from_og_meta_tag(): void
    {
        $html = '<html><head><meta property="og:description" content="Test article description"/></head></html>';

        $description = BelgaArticlePageParser::extractDescription($html);

        $this->assertSame('Test article description', $description);
    }

    /**
     * With no og:description present, the text of a <p> is returned.
     */
    public function test_extract_description_from_paragraph(): void
    {
        $html = '<html><body><p>This is the first paragraph description.</p></body></html>';

        $description = BelgaArticlePageParser::extractDescription($html);

        $this->assertSame('This is the first paragraph description.', $description);
    }

    /**
     * HTML entities inside the og:description content are decoded in the result.
     */
    public function test_extract_description_with_html_entities(): void
    {
        $html = '<html><head><meta property="og:description" content="Description with &amp; entities &lt;test&gt;"/></head></html>';

        $description = BelgaArticlePageParser::extractDescription($html);

        $this->assertSame('Description with & entities <test>', $description);
    }

    /**
     * A document without og:description or any <p> yields null.
     */
    public function test_extract_description_returns_null_when_not_found(): void
    {
        $html = '<html><body><div>No description here</div></body></html>';

        $description = BelgaArticlePageParser::extractDescription($html);

        $this->assertNull($description);
    }

    /**
     * Paragraphs carrying a "styles_paragraph__*" class are collected and joined
     * with a blank line; a paragraph with a different class is left out.
     */
    public function test_extract_full_article_from_belga_paragraph_class(): void
    {
        $html = '
            <html>
                <body>
                    <p class="styles_paragraph__6o_o7">First paragraph content.</p>
                    <p class="styles_paragraph__6o_o7">Second paragraph content.</p>
                    <p class="other-class">This should be ignored.</p>
                </body>
            </html>
        ';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "First paragraph content.\n\nSecond paragraph content.";
        $this->assertSame($expected, $fullArticle);
    }

    /**
     * Empty and whitespace-only paragraphs do not contribute to the output.
     */
    public function test_extract_full_article_filters_empty_paragraphs(): void
    {
        $html = '
            <html>
                <body>
                    <p class="styles_paragraph__ABC123">Content paragraph.</p>
                    <p class="styles_paragraph__DEF456">   </p>
                    <p class="styles_paragraph__GHI789"></p>
                    <p class="styles_paragraph__JKL012">Another content paragraph.</p>
                </body>
            </html>
        ';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "Content paragraph.\n\nAnother content paragraph.";
        $this->assertSame($expected, $fullArticle);
    }

    /**
     * Inline markup (<strong>, <em>, <a>) is stripped while its text is kept.
     */
    public function test_extract_full_article_handles_nested_tags(): void
    {
        $html = '
            <html>
                <body>
                    <p class="styles_paragraph__TEST">This has <strong>bold text</strong> and <em>italic text</em>.</p>
                    <p class="styles_paragraph__TEST2">This has <a href="#">a link</a> inside.</p>
                </body>
            </html>
        ';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "This has bold text and italic text.\n\nThis has a link inside.";
        $this->assertSame($expected, $fullArticle);
    }

    /**
     * Contents of <script> and <style> elements never appear in the article text.
     */
    public function test_extract_full_article_removes_scripts_and_styles(): void
    {
        $html = '
            <html>
                <head>
                    <script>console.log("test");</script>
                    <style>.test { color: red; }</style>
                </head>
                <body>
                    <p class="styles_paragraph__TEST">Clean content.</p>
                    <script>alert("bad");</script>
                </body>
            </html>
        ';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $this->assertSame('Clean content.', $fullArticle);
        // Redundant with the exact match above, but kept to name each leak explicitly on failure.
        $this->assertStringNotContainsString('console.log', $fullArticle);
        $this->assertStringNotContainsString('alert', $fullArticle);
        $this->assertStringNotContainsString('color: red', $fullArticle);
    }

    /**
     * Without "styles_paragraph__*" paragraphs, plain <p> elements inside a
     * "prezly-slate-document" section are used.
     */
    public function test_extract_full_article_fallback_to_prezly_document(): void
    {
        $html = '
            <html>
                <body>
                    <section class="prezly-slate-document">
                        <p>Content from prezly section.</p>
                        <p>More prezly content.</p>
                    </section>
                </body>
            </html>
        ';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "Content from prezly section.\n\nMore prezly content.";
        $this->assertSame($expected, $fullArticle);
    }

    /**
     * Without Belga-specific classes or a prezly section, plain <p> elements are used.
     */
    public function test_extract_full_article_fallback_to_all_paragraphs(): void
    {
        $html = '
            <html>
                <body>
                    <p>First general paragraph.</p>
                    <p>Second general paragraph.</p>
                </body>
            </html>
        ';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "First general paragraph.\n\nSecond general paragraph.";
        $this->assertSame($expected, $fullArticle);
    }

    /**
     * A document with no <p> elements at all yields null.
     */
    public function test_extract_full_article_returns_null_when_no_content(): void
    {
        $html = '<html><body><div>No paragraphs here</div></body></html>';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $this->assertNull($fullArticle);
    }

    /**
     * An og:image meta tag supplies the thumbnail URL.
     */
    public function test_extract_thumbnail_from_og_image(): void
    {
        $html = '<html><head><meta property="og:image" content="https://example.com/image.jpg"/></head></html>';

        $thumbnail = BelgaArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/image.jpg', $thumbnail);
    }

    /**
     * With no og:image present, the src of an <img> is returned.
     */
    public function test_extract_thumbnail_from_img_tag(): void
    {
        $html = '<html><body><img src="https://example.com/article-image.png" alt="test"/></body></html>';

        $thumbnail = BelgaArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/article-image.png', $thumbnail);
    }

    /**
     * When both og:image and an <img> exist, the og:image URL wins.
     */
    public function test_extract_thumbnail_prefers_og_image(): void
    {
        $html = '
            <html>
                <head><meta property="og:image" content="https://example.com/og-image.jpg"/></head>
                <body><img src="https://example.com/body-image.png" alt="test"/></body>
            </html>
        ';

        $thumbnail = BelgaArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/og-image.jpg', $thumbnail);
    }

    /**
     * A document without og:image or <img> yields null.
     */
    public function test_extract_thumbnail_returns_null_when_not_found(): void
    {
        $html = '<html><body><div>No images here</div></body></html>';

        $thumbnail = BelgaArticlePageParser::extractThumbnail($html);

        $this->assertNull($thumbnail);
    }

    /**
     * extractData returns an array with title, description, full_article and
     * thumbnail keys, each populated from the matching part of the fixture.
     */
    public function test_extract_data_returns_all_components(): void
    {
        $html = '
            <html>
                <head>
                    <meta property="og:title" content="Test Article"/>
                    <meta property="og:description" content="Test description"/>
                    <meta property="og:image" content="https://example.com/image.jpg"/>
                </head>
                <body>
                    <p class="styles_paragraph__TEST">Full article content here.</p>
                </body>
            </html>
        ';

        $data = BelgaArticlePageParser::extractData($html);

        $this->assertIsArray($data);
        $this->assertArrayHasKey('title', $data);
        $this->assertArrayHasKey('description', $data);
        $this->assertArrayHasKey('full_article', $data);
        $this->assertArrayHasKey('thumbnail', $data);

        $this->assertSame('Test Article', $data['title']);
        $this->assertSame('Test description', $data['description']);
        $this->assertSame('Full article content here.', $data['full_article']);
        $this->assertSame('https://example.com/image.jpg', $data['thumbnail']);
    }

    /**
     * When nothing can be extracted, extractData still returns all four keys,
     * each set to null.
     */
    public function test_extract_data_handles_missing_components_gracefully(): void
    {
        $html = '<html><body><div>Minimal content</div></body></html>';

        $data = BelgaArticlePageParser::extractData($html);

        $this->assertIsArray($data);
        // Key presence is checked separately so a missing key is not mistaken for a null value.
        $this->assertArrayHasKey('title', $data);
        $this->assertArrayHasKey('description', $data);
        $this->assertArrayHasKey('full_article', $data);
        $this->assertArrayHasKey('thumbnail', $data);

        $this->assertNull($data['title']);
        $this->assertNull($data['description']);
        $this->assertNull($data['full_article']);
        $this->assertNull($data['thumbnail']);
    }

    /**
     * Fixture modelled on the markup of a real Belga article: "styles_paragraph__*"
     * paragraphs nested inside a "prezly-slate-document" section and a renderer div.
     * Only substrings are checked here, not the exact output.
     */
    public function test_extract_full_article_with_realistic_belga_html(): void
    {
        $html = '
            <html>
                <body>
                    <div class="ContentRenderer_renderer__IBbst">
                        <section class="prezly-slate-document">
                            <p class="styles_paragraph__6o_o7"><strong>Around 110,000 people joined the Antwerp Pride Parade on Saturday afternoon, according to police.</strong></p>
                            <p class="styles_paragraph__6o_o7">The event passed without major incidents. Earlier in the day, far-right group Voorpost held a pre-approved protest.</p>
                            <p class="styles_paragraph__6o_o7">Police say they expect no problems with crowd dispersal, as departures will be staggered.</p>
                        </section>
                    </div>
                </body>
            </html>
        ';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $this->assertNotNull($fullArticle);
        $this->assertStringContainsString('110,000 people joined', $fullArticle);
        $this->assertStringContainsString('major incidents', $fullArticle);
        $this->assertStringContainsString('crowd dispersal', $fullArticle);

        // Should join paragraphs with double newlines
        $this->assertStringContainsString("\n\n", $fullArticle);

        // Should strip HTML tags
        $this->assertStringNotContainsString('<strong>', $fullArticle);
        $this->assertStringNotContainsString('</strong>', $fullArticle);
    }
}
Fix article validation 2025-08-09 21:32:46 +02:00			`<?php`

			`namespace Tests\Unit\Services\Parsers;`

			`use App\Services\Parsers\BelgaArticlePageParser;`
			`use Tests\TestCase;`

			`class BelgaArticlePageParserTest extends TestCase`
			`{`
			`public function test_extract_title_from_og_meta_tag(): void`
			`{`
			`$html = '<html><head><meta property="og:title" content="Test Article Title"/></head><body></body></html>';`

			`$title = BelgaArticlePageParser::extractTitle($html);`

			`$this->assertEquals('Test Article Title', $title);`
			`}`

			`public function test_extract_title_from_h1_tag(): void`
			`{`
			`$html = '<html><body><h1>H1 Title Test</h1></body></html>';`

			`$title = BelgaArticlePageParser::extractTitle($html);`

			`$this->assertEquals('H1 Title Test', $title);`
			`}`

			`public function test_extract_title_from_title_tag(): void`
			`{`
			`$html = '<html><head><title>Page Title Test</title></head><body></body></html>';`

			`$title = BelgaArticlePageParser::extractTitle($html);`

			`$this->assertEquals('Page Title Test', $title);`
			`}`

			`public function test_extract_title_with_html_entities(): void`
			`{`
			`$html = '<html><head><meta property="og:title" content="Test & Article "Title""/></head></html>';`

			`$title = BelgaArticlePageParser::extractTitle($html);`

			`$this->assertEquals('Test & Article "Title"', $title);`
			`}`

			`public function test_extract_title_returns_null_when_not_found(): void`
			`{`
			`$html = '<html><body><p>No title here</p></body></html>';`

			`$title = BelgaArticlePageParser::extractTitle($html);`

			`$this->assertNull($title);`
			`}`

			`public function test_extract_description_from_og_meta_tag(): void`
			`{`
			`$html = '<html><head><meta property="og:description" content="Test article description"/></head></html>';`

			`$description = BelgaArticlePageParser::extractDescription($html);`

			`$this->assertEquals('Test article description', $description);`
			`}`

			`public function test_extract_description_from_paragraph(): void`
			`{`
			`$html = '<html><body><p>This is the first paragraph description.</p></body></html>';`

			`$description = BelgaArticlePageParser::extractDescription($html);`

			`$this->assertEquals('This is the first paragraph description.', $description);`
			`}`

			`public function test_extract_description_with_html_entities(): void`
			`{`
			`$html = '<html><head><meta property="og:description" content="Description with & entities <test>"/></head></html>';`

			`$description = BelgaArticlePageParser::extractDescription($html);`

			`$this->assertEquals('Description with & entities <test>', $description);`
			`}`

			`public function test_extract_description_returns_null_when_not_found(): void`
			`{`
			`$html = '<html><body><div>No description here</div></body></html>';`

			`$description = BelgaArticlePageParser::extractDescription($html);`

			`$this->assertNull($description);`
			`}`

			`public function test_extract_full_article_from_belga_paragraph_class(): void`
			`{`
			`$html = '`
			`<html>`
			`<body>`
			`<p class="styles_paragraph__6o_o7">First paragraph content.</p>`
			`<p class="styles_paragraph__6o_o7">Second paragraph content.</p>`
			`<p class="other-class">This should be ignored.</p>`
			`</body>`
			`</html>`
			`';`

			`$fullArticle = BelgaArticlePageParser::extractFullArticle($html);`

			`$expected = "First paragraph content.\n\nSecond paragraph content.";`
			`$this->assertEquals($expected, $fullArticle);`
			`}`

			`public function test_extract_full_article_filters_empty_paragraphs(): void`
			`{`
			`$html = '`
			`<html>`
			`<body>`
			`<p class="styles_paragraph__ABC123">Content paragraph.</p>`
			`<p class="styles_paragraph__DEF456"> </p>`
			`<p class="styles_paragraph__GHI789"></p>`
			`<p class="styles_paragraph__JKL012">Another content paragraph.</p>`
			`</body>`
			`</html>`
			`';`

			`$fullArticle = BelgaArticlePageParser::extractFullArticle($html);`

			`$expected = "Content paragraph.\n\nAnother content paragraph.";`
			`$this->assertEquals($expected, $fullArticle);`
			`}`

			`public function test_extract_full_article_handles_nested_tags(): void`
			`{`
			`$html = '`
			`<html>`
			`<body>`
			`<p class="styles_paragraph__TEST">This has <strong>bold text</strong> and <em>italic text</em>.</p>`
			`<p class="styles_paragraph__TEST2">This has <a href="#">a link</a> inside.</p>`
			`</body>`
			`</html>`
			`';`

			`$fullArticle = BelgaArticlePageParser::extractFullArticle($html);`

			`$expected = "This has bold text and italic text.\n\nThis has a link inside.";`
			`$this->assertEquals($expected, $fullArticle);`
			`}`

			`public function test_extract_full_article_removes_scripts_and_styles(): void`
			`{`
			`$html = '`
			`<html>`
			`<head>`
			`<script>console.log("test");</script>`
			`<style>.test { color: red; }</style>`
			`</head>`
			`<body>`
			`<p class="styles_paragraph__TEST">Clean content.</p>`
			`<script>alert("bad");</script>`
			`</body>`
			`</html>`
			`';`

			`$fullArticle = BelgaArticlePageParser::extractFullArticle($html);`

			`$this->assertEquals('Clean content.', $fullArticle);`
			`$this->assertStringNotContainsString('console.log', $fullArticle);`
			`$this->assertStringNotContainsString('alert', $fullArticle);`
			`$this->assertStringNotContainsString('color: red', $fullArticle);`
			`}`

			`public function test_extract_full_article_fallback_to_prezly_document(): void`
			`{`
			`$html = '`
			`<html>`
			`<body>`
			`<section class="prezly-slate-document">`
			`<p>Content from prezly section.</p>`
			`<p>More prezly content.</p>`
			`</section>`
			`</body>`
			`</html>`
			`';`

			`$fullArticle = BelgaArticlePageParser::extractFullArticle($html);`

			`$expected = "Content from prezly section.\n\nMore prezly content.";`
			`$this->assertEquals($expected, $fullArticle);`
			`}`

			`public function test_extract_full_article_fallback_to_all_paragraphs(): void`
			`{`
			`$html = '`
			`<html>`
			`<body>`
			`<p>First general paragraph.</p>`
			`<p>Second general paragraph.</p>`
			`</body>`
			`</html>`
			`';`

			`$fullArticle = BelgaArticlePageParser::extractFullArticle($html);`

			`$expected = "First general paragraph.\n\nSecond general paragraph.";`
			`$this->assertEquals($expected, $fullArticle);`
			`}`

			`public function test_extract_full_article_returns_null_when_no_content(): void`
			`{`
			`$html = '<html><body><div>No paragraphs here</div></body></html>';`

			`$fullArticle = BelgaArticlePageParser::extractFullArticle($html);`

			`$this->assertNull($fullArticle);`
			`}`

			`public function test_extract_thumbnail_from_og_image(): void`
			`{`
			`$html = '<html><head><meta property="og:image" content="https://example.com/image.jpg"/></head></html>';`

			`$thumbnail = BelgaArticlePageParser::extractThumbnail($html);`

			`$this->assertEquals('https://example.com/image.jpg', $thumbnail);`
			`}`

			`public function test_extract_thumbnail_from_img_tag(): void`
			`{`
			`$html = '<html><body><img src="https://example.com/article-image.png" alt="test"/></body></html>';`

			`$thumbnail = BelgaArticlePageParser::extractThumbnail($html);`

			`$this->assertEquals('https://example.com/article-image.png', $thumbnail);`
			`}`

			`public function test_extract_thumbnail_prefers_og_image(): void`
			`{`
			`$html = '`
			`<html>`
			`<head><meta property="og:image" content="https://example.com/og-image.jpg"/></head>`
			`<body><img src="https://example.com/body-image.png" alt="test"/></body>`
			`</html>`
			`';`

			`$thumbnail = BelgaArticlePageParser::extractThumbnail($html);`

			`$this->assertEquals('https://example.com/og-image.jpg', $thumbnail);`
			`}`

			`public function test_extract_thumbnail_returns_null_when_not_found(): void`
			`{`
			`$html = '<html><body><div>No images here</div></body></html>';`

			`$thumbnail = BelgaArticlePageParser::extractThumbnail($html);`

			`$this->assertNull($thumbnail);`
			`}`

			`public function test_extract_data_returns_all_components(): void`
			`{`
			`$html = '`
			`<html>`
			`<head>`
			`<meta property="og:title" content="Test Article"/>`
			`<meta property="og:description" content="Test description"/>`
			`<meta property="og:image" content="https://example.com/image.jpg"/>`
			`</head>`
			`<body>`
			`<p class="styles_paragraph__TEST">Full article content here.</p>`
			`</body>`
			`</html>`
			`';`

			`$data = BelgaArticlePageParser::extractData($html);`

			`$this->assertIsArray($data);`
			`$this->assertArrayHasKey('title', $data);`
			`$this->assertArrayHasKey('description', $data);`
			`$this->assertArrayHasKey('full_article', $data);`
			`$this->assertArrayHasKey('thumbnail', $data);`

			`$this->assertEquals('Test Article', $data['title']);`
			`$this->assertEquals('Test description', $data['description']);`
			`$this->assertEquals('Full article content here.', $data['full_article']);`
			`$this->assertEquals('https://example.com/image.jpg', $data['thumbnail']);`
			`}`

			`public function test_extract_data_handles_missing_components_gracefully(): void`
			`{`
			`$html = '<html><body><div>Minimal content</div></body></html>';`

			`$data = BelgaArticlePageParser::extractData($html);`

			`$this->assertIsArray($data);`
			`$this->assertArrayHasKey('title', $data);`
			`$this->assertArrayHasKey('description', $data);`
			`$this->assertArrayHasKey('full_article', $data);`
			`$this->assertArrayHasKey('thumbnail', $data);`

			`$this->assertNull($data['title']);`
			`$this->assertNull($data['description']);`
			`$this->assertNull($data['full_article']);`
			`$this->assertNull($data['thumbnail']);`
			`}`

			`/**`
			`* Test based on actual Belga HTML structure from real article`
			`*/`
			`public function test_extract_full_article_with_realistic_belga_html(): void`
			`{`
			`$html = '`
			`<html>`
			`<body>`
			`<div class="ContentRenderer_renderer__IBbst">`
			`<section class="prezly-slate-document">`
			`<p class="styles_paragraph__6o_o7"><strong>Around 110,000 people joined the Antwerp Pride Parade on Saturday afternoon, according to police.</strong></p>`
			`<p class="styles_paragraph__6o_o7">The event passed without major incidents. Earlier in the day, far-right group Voorpost held a pre-approved protest.</p>`
			`<p class="styles_paragraph__6o_o7">Police say they expect no problems with crowd dispersal, as departures will be staggered.</p>`
			`</section>`
			`</div>`
			`</body>`
			`</html>`
			`';`

			`$fullArticle = BelgaArticlePageParser::extractFullArticle($html);`

			`$this->assertNotNull($fullArticle);`
			`$this->assertStringContainsString('110,000 people joined', $fullArticle);`
			`$this->assertStringContainsString('major incidents', $fullArticle);`
			`$this->assertStringContainsString('crowd dispersal', $fullArticle);`

			`// Should join paragraphs with double newlines`
			`$this->assertStringContainsString("\n\n", $fullArticle);`

			`// Should strip HTML tags`
			`$this->assertStringNotContainsString('<strong>', $fullArticle);`
			`$this->assertStringNotContainsString('</strong>', $fullArticle);`
			`}`
			`}`