37 - Add The Guardian as RSS feed provider, implement RSS parsing

2026-03-08 11:02:46 +01:00 · 2026-03-08 11:02:46 +01:00 · 1e39a25f83
commit 1e39a25f83
parent 0123e20b1d
12 changed files with 754 additions and 10 deletions
--- a/app/Http/Requests/StoreFeedRequest.php
+++ b/app/Http/Requests/StoreFeedRequest.php
@ -12,13 +12,15 @@ public function authorize(): bool
    }
    /**
-     * @return array<string, string>
+     * @return array<string, mixed>
     */
    public function rules(): array
    {
        $providers = implode(',', array_keys(config('feed.providers', [])));
        return [
            'name' => 'required|string|max:255',
-            'provider' => 'required|in:vrt,belga',
+            'provider' => "required|in:{$providers}",
            'language_id' => 'required|exists:languages,id',
            'description' => 'nullable|string',
            'is_active' => 'boolean'
--- a/app/Services/Article/ArticleFetcher.php
+++ b/app/Services/Article/ArticleFetcher.php
@ -41,11 +41,47 @@ public function getArticlesFromFeed(Feed $feed): Collection
     */
    private function getArticlesFromRssFeed(Feed $feed): Collection
    {
        // Any exception raised while fetching, parsing or saving is logged and
        // turned into an empty collection so one broken feed cannot abort the caller.
        try {
            $xml = HttpFetcher::fetchHtml($feed->url);

            // Keep libxml parse errors out of PHP's warning stream while parsing,
            // then restore whatever error mode was active before this call.
            $previousUseErrors = libxml_use_internal_errors(true);
            try {
                // The document comes from a remote server: LIBXML_NONET forbids
                // libxml from opening network resources while parsing it.
                $rss = simplexml_load_string($xml, \SimpleXMLElement::class, LIBXML_NONET);
            } finally {
                libxml_clear_errors();
                libxml_use_internal_errors($previousUseErrors);
            }

            // Both unparsable XML and a document without channel/item entries
            // are reported with the same warning and yield no articles.
            if ($rss === false || !isset($rss->channel->item)) {
                $this->logSaver->warning("Failed to parse RSS feed XML", null, [
                    'feed_id' => $feed->id,
                    'feed_url' => $feed->url,
                ]);

                return collect();
            }

            $articles = collect();
            foreach ($rss->channel->item as $item) {
                // Pretty-printed feeds may wrap the URL in whitespace/newlines;
                // trim it so the stored URL is clean and comparable.
                $link = trim((string) $item->link);
                if ($link !== '') {
                    $articles->push($this->saveArticle($link, $feed->id));
                }
            }

            return $articles;
        } catch (Exception $e) {
            $this->logSaver->error("Failed to fetch articles from RSS feed", null, [
                'feed_id' => $feed->id,
                'feed_url' => $feed->url,
                'error' => $e->getMessage(),
            ]);

            return collect();
        }
    }
    /**
     * @return Collection<int, Article>
     */
--- a/app/Services/Factories/ArticleParserFactory.php
+++ b/app/Services/Factories/ArticleParserFactory.php
@ -6,6 +6,7 @@
 use App\Models\Feed;
 use App\Services\Parsers\VrtArticleParser;
 use App\Services\Parsers\BelgaArticleParser;
 use App\Services\Parsers\GuardianArticleParser;
 use Exception;
 class ArticleParserFactory
@ -16,6 +17,7 @@ class ArticleParserFactory
    private static array $parsers = [
        VrtArticleParser::class,
        BelgaArticleParser::class,
        GuardianArticleParser::class,
    ];
    /**
--- a/app/Services/Parsers/GuardianArticlePageParser.php
+++ b/app/Services/Parsers/GuardianArticlePageParser.php
@ -0,0 +1,110 @@
 <?php
 namespace App\Services\Parsers;
 /**
  * Regex-based extractor for the title, description, body text and thumbnail
  * of an article page's HTML. All methods are static and return null when the
  * requested piece cannot be found.
  */
 class GuardianArticlePageParser
 {
    /**
     * Returns the article title, trying in order: the og:title meta tag,
     * the first <h1> that contains only text, and the <title> tag.
     */
    public static function extractTitle(string $html): ?string
    {
        // Try meta title first
        if (preg_match('/<meta property="og:title" content="([^"]+)"/i', $html, $matches)) {
            return self::decodeEntities($matches[1]);
        }
        // Try any h1 tag
        if (preg_match('/<h1[^>]*>([^<]+)<\/h1>/i', $html, $matches)) {
            return self::decodeEntities(strip_tags($matches[1]));
        }
        // Try title tag
        if (preg_match('/<title>([^<]+)<\/title>/i', $html, $matches)) {
            return self::decodeEntities($matches[1]);
        }
        return null;
    }

    /**
     * Returns the article description: the og:description meta tag when
     * present, otherwise the text of the first matching <p> element.
     */
    public static function extractDescription(string $html): ?string
    {
        // Try meta description first
        if (preg_match('/<meta property="og:description" content="([^"]+)"/i', $html, $matches)) {
            return self::decodeEntities($matches[1]);
        }
        // Try first paragraph
        if (preg_match('/<p[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i', $html, $matches)) {
            return self::decodeEntities(strip_tags($matches[1]));
        }
        return null;
    }

    /**
     * Returns the article body as plain text, one paragraph per block
     * separated by a blank line, or null when no paragraph text is found.
     */
    public static function extractFullArticle(string $html): ?string
    {
        // Remove scripts, styles, and other non-content elements.
        // preg_replace() returns null on a PCRE error (e.g. backtrack limit on
        // a very large page); in that case keep working on the unstripped HTML
        // instead of passing null on to the matchers below.
        $cleanHtml = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi', '', $html) ?? $html;
        $cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi', '', $cleanHtml) ?? $cleanHtml;

        // Try Guardian-specific article body container (greedy to avoid stopping at nested divs)
        if (preg_match('/<div[^>]*class="[^"]*article-body-commercial-selector[^"]*"[^>]*>(.*)<\/div>/is', $cleanHtml, $sectionMatches)) {
            preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $sectionMatches[1], $matches);
            if (!empty($matches[1])) {
                return self::joinParagraphs($matches[1]);
            }
        }

        // Fallback: extract all paragraph content
        preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches);
        if (!empty($matches[1])) {
            return self::joinParagraphs($matches[1]);
        }
        return null;
    }

    /**
     * Returns the thumbnail URL: the og:image meta tag when present,
     * otherwise the src of the first <img> tag.
     *
     * The URL is taken from an HTML attribute, so entities such as &amp;
     * in its query string are decoded before it is returned.
     */
    public static function extractThumbnail(string $html): ?string
    {
        // Try OpenGraph image first
        if (preg_match('/<meta property="og:image" content="([^"]+)"/i', $html, $matches)) {
            return self::decodeEntities($matches[1]);
        }
        // Try first image in content
        if (preg_match('/<img[^>]+src="([^"]+)"/i', $html, $matches)) {
            return self::decodeEntities($matches[1]);
        }
        return null;
    }

    /**
     * Runs every extractor on the same HTML.
     *
     * @return array<string, string|null> keys: title, description, full_article, thumbnail
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
            'thumbnail' => self::extractThumbnail($html),
        ];
    }

    /**
     * Strips tags and decodes entities in each paragraph, drops paragraphs
     * that are empty after trimming, and joins the rest with a blank line.
     *
     * @param array<int, string> $paragraphs
     */
    private static function joinParagraphs(array $paragraphs): ?string
    {
        $paragraphs = array_map(function ($paragraph) {
            return self::decodeEntities(strip_tags($paragraph));
        }, $paragraphs);
        $fullText = implode("\n\n", array_filter($paragraphs, function ($p) {
            return trim($p) !== '';
        }));
        // Explicit comparison: a truthiness check would turn the text "0" into null.
        return $fullText !== '' ? $fullText : null;
    }

    /**
     * Decodes HTML entities (including both quote styles) as UTF-8.
     */
    private static function decodeEntities(string $value): string
    {
        return html_entity_decode($value, ENT_QUOTES, 'UTF-8');
    }
 }
--- a/app/Services/Parsers/GuardianArticleParser.php
+++ b/app/Services/Parsers/GuardianArticleParser.php
@ -0,0 +1,23 @@
 <?php
 namespace App\Services\Parsers;
 use App\Contracts\ArticleParserInterface;
 /**
  * Article parser for pages hosted on theguardian.com; extraction itself is
  * delegated to GuardianArticlePageParser.
  */
 class GuardianArticleParser implements ArticleParserInterface
 {
    /** Registrable domain whose pages (and subdomains) this parser accepts. */
    private const GUARDIAN_HOST = 'theguardian.com';

    /**
     * Tells whether the URL points at theguardian.com or one of its subdomains.
     *
     * The decision is made on the URL's host rather than on a substring of the
     * whole URL, so look-alike hosts (e.g. "nottheguardian.com") and URLs that
     * merely mention the domain in their path or query are rejected.
     */
    public function canParse(string $url): bool
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            // Scheme-less input such as "www.theguardian.com/world/..." has no
            // host component; re-parse it as a protocol-relative URL.
            $host = parse_url('//' . ltrim($url, '/'), PHP_URL_HOST);
        }
        if (!is_string($host) || $host === '') {
            return false;
        }

        // Host names are case-insensitive.
        $host = strtolower($host);

        return $host === self::GUARDIAN_HOST
            || str_ends_with($host, '.' . self::GUARDIAN_HOST);
    }

    /**
     * Extracts the article data from the page HTML via GuardianArticlePageParser.
     */
    public function extractData(string $html): array
    {
        return GuardianArticlePageParser::extractData($html);
    }

    /**
     * Human-readable name of the source handled by this parser.
     */
    public function getSourceName(): string
    {
        return 'The Guardian';
    }
 }
--- a/config/feed.php
+++ b/config/feed.php
@ -33,7 +33,7 @@
            'code' => 'belga',
            'name' => 'Belga News Agency',
            'description' => 'Belgian national news agency',
-            'type' => 'rss',
+            'type' => 'website',
            'is_active' => true,
            'languages' => [
                'en' => ['url' => 'https://www.belganewsagency.eu/'],
@ -44,6 +44,20 @@
                'article_page' => \App\Services\Parsers\BelgaArticlePageParser::class,
            ],
        ],
        'guardian' => [
            'code' => 'guardian',
            'name' => 'The Guardian',
            'description' => 'British daily newspaper',
            'type' => 'rss',
            'is_active' => true,
            'languages' => [
                'en' => ['url' => 'https://www.theguardian.com/international/rss'],
            ],
            'parsers' => [
                'article' => \App\Services\Parsers\GuardianArticleParser::class,
                'article_page' => \App\Services\Parsers\GuardianArticlePageParser::class,
            ],
        ],
    ],
    /*
--- a/tests/Feature/Http/Controllers/Api/V1/FeedsControllerTest.php
+++ b/tests/Feature/Http/Controllers/Api/V1/FeedsControllerTest.php
@ -99,7 +99,7 @@ public function test_store_creates_belga_feed_successfully(): void
                'data' => [
                    'name' => 'Belga Test Feed',
                    'url' => 'https://www.belganewsagency.eu/',
-                    'type' => 'rss',
+                    'type' => 'website',
                    'is_active' => true,
                ]
            ]);
@ -107,6 +107,38 @@ public function test_store_creates_belga_feed_successfully(): void
        $this->assertDatabaseHas('feeds', [
            'name' => 'Belga Test Feed',
            'url' => 'https://www.belganewsagency.eu/',
            'type' => 'website',
        ]);
    }
    public function test_store_creates_guardian_feed_successfully(): void
    {
        // Arrange: the request names only the provider; url and type are
        // expected to come back filled in on the created feed.
        $english = Language::factory()->english()->create();
        $payload = [
            'name' => 'Guardian Test Feed',
            'provider' => 'guardian',
            'language_id' => $english->id,
            'is_active' => true,
        ];
        $expectedFeed = [
            'name' => 'Guardian Test Feed',
            'url' => 'https://www.theguardian.com/international/rss',
            'type' => 'rss',
        ];

        // Act
        $response = $this->postJson('/api/v1/feeds', $payload);

        // Assert: HTTP response shape
        $response->assertStatus(201);
        $response->assertJson([
            'success' => true,
            'message' => 'Feed created successfully!',
            'data' => $expectedFeed + ['is_active' => true],
        ]);

        // Assert: the feed was persisted with the same url and type
        $this->assertDatabaseHas('feeds', $expectedFeed);
    }
--- a/tests/Unit/Actions/CreateFeedActionTest.php
+++ b/tests/Unit/Actions/CreateFeedActionTest.php
@ -43,11 +43,23 @@ public function test_creates_belga_feed_with_correct_url(): void
        $feed = $this->action->execute('Belga News', 'belga', $language->id);
        $this->assertEquals('https://www.belganewsagency.eu/', $feed->url);
-        $this->assertEquals('rss', $feed->type);
+        $this->assertEquals('website', $feed->type);
        $this->assertEquals('belga', $feed->provider);
        $this->assertNull($feed->description);
    }
    public function test_creates_guardian_feed_with_correct_url(): void
    {
        // An active English language is required for the provider's "en" entry.
        $english = Language::factory()->create([
            'short_code' => 'en',
            'is_active' => true,
        ]);

        $created = $this->action->execute('Guardian News', 'guardian', $english->id);

        // URL and type are expected to be filled in for the guardian provider.
        $this->assertEquals('https://www.theguardian.com/international/rss', $created->url);
        $this->assertEquals('rss', $created->type);
        $this->assertEquals('guardian', $created->provider);
        // No description was passed to the action.
        $this->assertNull($created->description);
    }
    public function test_creates_vrt_feed_with_dutch_language(): void
    {
        $language = Language::factory()->create(['short_code' => 'nl', 'is_active' => true]);
--- a/tests/Unit/Services/ArticleFetcherRssTest.php
+++ b/tests/Unit/Services/ArticleFetcherRssTest.php
@ -0,0 +1,164 @@
 <?php
 namespace Tests\Unit\Services;
 use App\Models\Article;
 use App\Models\Feed;
 use Illuminate\Foundation\Testing\RefreshDatabase;
 use Illuminate\Support\Facades\Http;
 use Mockery;
 use Tests\TestCase;
 use Tests\Traits\CreatesArticleFetcher;
 /**
  * Exercises ArticleFetcher::getArticlesFromFeed() for feeds of type "rss",
  * with all HTTP traffic faked.
  */
 class ArticleFetcherRssTest extends TestCase
 {
    use RefreshDatabase, CreatesArticleFetcher;

    private const FEED_URL = 'https://www.theguardian.com/international/rss';
    private const FIRST_ARTICLE_URL = 'https://www.theguardian.com/world/2026/mar/08/first-article';
    private const SECOND_ARTICLE_URL = 'https://www.theguardian.com/world/2026/mar/08/second-article';

    /** RSS document with two items, used as the default faked response body. */
    private string $sampleRss;

    protected function setUp(): void
    {
        parent::setUp();
        $this->sampleRss = <<<'XML'
 <?xml version="1.0" encoding="UTF-8"?>
 <rss version="2.0">
    <channel>
        <title>The Guardian - International</title>
        <link>https://www.theguardian.com/international</link>
        <item>
            <title>First Article Title</title>
            <link>https://www.theguardian.com/world/2026/mar/08/first-article</link>
            <description>First article description</description>
            <pubDate>Sun, 08 Mar 2026 12:00:00 GMT</pubDate>
        </item>
        <item>
            <title>Second Article Title</title>
            <link>https://www.theguardian.com/world/2026/mar/08/second-article</link>
            <description>Second article description</description>
            <pubDate>Sun, 08 Mar 2026 11:00:00 GMT</pubDate>
        </item>
    </channel>
 </rss>
 XML;
    }

    public function test_get_articles_from_rss_feed_returns_collection(): void
    {
        $this->fakeHttpResponse($this->sampleRss, 200);
        $guardianFeed = $this->createGuardianRssFeed();

        $fetched = $this->createArticleFetcher()->getArticlesFromFeed($guardianFeed);

        $this->assertInstanceOf(\Illuminate\Support\Collection::class, $fetched);
    }

    public function test_get_articles_from_rss_feed_creates_articles(): void
    {
        $this->fakeHttpResponse($this->sampleRss, 200);
        $guardianFeed = $this->createGuardianRssFeed();

        $fetched = $this->createArticleFetcher()->getArticlesFromFeed($guardianFeed);

        // One article per <item> in the sample feed, each tied to this feed.
        $this->assertCount(2, $fetched);
        $this->assertDatabaseHas('articles', [
            'url' => self::FIRST_ARTICLE_URL,
            'feed_id' => $guardianFeed->id,
        ]);
        $this->assertDatabaseHas('articles', [
            'url' => self::SECOND_ARTICLE_URL,
            'feed_id' => $guardianFeed->id,
        ]);
    }

    public function test_get_articles_from_rss_feed_does_not_duplicate_existing(): void
    {
        $this->fakeHttpResponse($this->sampleRss, 200);
        $guardianFeed = $this->createGuardianRssFeed();

        // The first item of the sample feed already exists before fetching.
        Article::factory()->create([
            'url' => self::FIRST_ARTICLE_URL,
            'feed_id' => $guardianFeed->id,
        ]);

        $fetched = $this->createArticleFetcher()->getArticlesFromFeed($guardianFeed);

        // Both items are returned, but the pre-existing URL is stored only once.
        $this->assertCount(2, $fetched);
        $this->assertEquals(1, Article::where('url', self::FIRST_ARTICLE_URL)->count());
    }

    public function test_get_articles_from_rss_feed_handles_invalid_xml(): void
    {
        $this->fakeHttpResponse('this is not xml', 200);
        $guardianFeed = $this->createGuardianRssFeed();

        $fetched = $this->createArticleFetcher()->getArticlesFromFeed($guardianFeed);

        $this->assertInstanceOf(\Illuminate\Support\Collection::class, $fetched);
        $this->assertEmpty($fetched);
    }

    public function test_get_articles_from_rss_feed_handles_empty_channel(): void
    {
        // Well-formed RSS whose channel contains no <item> elements.
        $this->fakeHttpResponse(
            '<?xml version="1.0"?><rss><channel><title>Empty</title></channel></rss>',
            200
        );
        $guardianFeed = $this->createGuardianRssFeed();

        $fetched = $this->createArticleFetcher()->getArticlesFromFeed($guardianFeed);

        $this->assertEmpty($fetched);
    }

    public function test_get_articles_from_rss_feed_handles_http_failure(): void
    {
        $this->fakeHttpResponse('Server Error', 500);
        $guardianFeed = $this->createGuardianRssFeed();

        $fetched = $this->createArticleFetcher()->getArticlesFromFeed($guardianFeed);

        $this->assertEmpty($fetched);
    }

    protected function tearDown(): void
    {
        Mockery::close();
        parent::tearDown();
    }

    /**
     * Makes every outgoing HTTP request answer with the given body and status.
     */
    private function fakeHttpResponse(string $body, int $status): void
    {
        Http::fake(['*' => Http::response($body, $status)]);
    }

    /**
     * Persists a feed of type "rss" for the guardian provider.
     */
    private function createGuardianRssFeed(): Feed
    {
        return Feed::factory()->create([
            'type' => 'rss',
            'provider' => 'guardian',
            'url' => self::FEED_URL,
        ]);
    }
 }
--- a/tests/Unit/Services/Factories/ArticleParserFactoryTest.php
+++ b/tests/Unit/Services/Factories/ArticleParserFactoryTest.php
@ -46,9 +46,10 @@ public function test_get_supported_sources_returns_array_of_source_names(): void
        $sources = ArticleParserFactory::getSupportedSources();
        $this->assertIsArray($sources);
-        $this->assertCount(2, $sources);
+        $this->assertCount(3, $sources);
        $this->assertContains('VRT News', $sources);
        $this->assertContains('Belga News Agency', $sources);
        $this->assertContains('The Guardian', $sources);
    }
    public function test_get_supported_sources_returns_sources_in_correct_order(): void
@ -88,7 +89,7 @@ public function getSourceName(): string
        // Verify it's now included in supported sources
        $sources = ArticleParserFactory::getSupportedSources();
        $this->assertContains('TestParser', $sources);
-        $this->assertCount(3, $sources); // Original 2 + 1 new
+        $this->assertCount(4, $sources); // Original 3 + 1 new
        // Verify it can be used to parse URLs
        $testUrl = 'https://test-parser.com/article';
--- a/tests/Unit/Services/Parsers/GuardianArticlePageParserTest.php
+++ b/tests/Unit/Services/Parsers/GuardianArticlePageParserTest.php
@ -0,0 +1,285 @@
 <?php
 namespace Tests\Unit\Services\Parsers;
 use App\Services\Parsers\GuardianArticlePageParser;
 use Tests\TestCase;
 /**
  * Unit tests for the static extractors of GuardianArticlePageParser.
  *
  * String results are compared with assertSame so that both the value and the
  * type of what the parser returns are pinned (assertEquals compares loosely).
  */
 class GuardianArticlePageParserTest extends TestCase
 {
    public function test_extract_title_from_og_meta_tag(): void
    {
        $html = '<html><head><meta property="og:title" content="Guardian Article Title"/></head><body></body></html>';
        $title = GuardianArticlePageParser::extractTitle($html);
        $this->assertSame('Guardian Article Title', $title);
    }

    public function test_extract_title_from_h1_tag(): void
    {
        $html = '<html><body><h1>H1 Title Test</h1></body></html>';
        $title = GuardianArticlePageParser::extractTitle($html);
        $this->assertSame('H1 Title Test', $title);
    }

    public function test_extract_title_from_title_tag(): void
    {
        $html = '<html><head><title>Page Title Test</title></head><body></body></html>';
        $title = GuardianArticlePageParser::extractTitle($html);
        $this->assertSame('Page Title Test', $title);
    }

    public function test_extract_title_with_html_entities(): void
    {
        // Entities inside the attribute value must come back decoded.
        $html = '<html><head><meta property="og:title" content="Test &amp; Article &quot;Title&quot;"/></head></html>';
        $title = GuardianArticlePageParser::extractTitle($html);
        $this->assertSame('Test & Article "Title"', $title);
    }

    public function test_extract_title_returns_null_when_not_found(): void
    {
        $html = '<html><body><p>No title here</p></body></html>';
        $title = GuardianArticlePageParser::extractTitle($html);
        $this->assertNull($title);
    }

    public function test_extract_description_from_og_meta_tag(): void
    {
        $html = '<html><head><meta property="og:description" content="Guardian article description"/></head></html>';
        $description = GuardianArticlePageParser::extractDescription($html);
        $this->assertSame('Guardian article description', $description);
    }

    public function test_extract_description_from_paragraph(): void
    {
        $html = '<html><body><p>This is the first paragraph description.</p></body></html>';
        $description = GuardianArticlePageParser::extractDescription($html);
        $this->assertSame('This is the first paragraph description.', $description);
    }

    public function test_extract_description_returns_null_when_not_found(): void
    {
        $html = '<html><body><div>No description here</div></body></html>';
        $description = GuardianArticlePageParser::extractDescription($html);
        $this->assertNull($description);
    }

    public function test_extract_full_article_from_guardian_article_body(): void
    {
        $html = '
            <html>
                <body>
                    <div class="article-body-commercial-selector">
                        <p>First paragraph of the article.</p>
                        <p>Second paragraph of the article.</p>
                    </div>
                </body>
            </html>
        ';
        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);
        $expected = "First paragraph of the article.\n\nSecond paragraph of the article.";
        $this->assertSame($expected, $fullArticle);
    }

    public function test_extract_full_article_fallback_to_all_paragraphs(): void
    {
        // No article-body container: every <p> in the page is used instead.
        $html = '
            <html>
                <body>
                    <p>First general paragraph.</p>
                    <p>Second general paragraph.</p>
                </body>
            </html>
        ';
        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);
        $expected = "First general paragraph.\n\nSecond general paragraph.";
        $this->assertSame($expected, $fullArticle);
    }

    public function test_extract_full_article_filters_empty_paragraphs(): void
    {
        $html = '
            <html>
                <body>
                    <div class="article-body-commercial-selector">
                        <p>Content paragraph.</p>
                        <p>   </p>
                        <p></p>
                        <p>Another content paragraph.</p>
                    </div>
                </body>
            </html>
        ';
        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);
        $expected = "Content paragraph.\n\nAnother content paragraph.";
        $this->assertSame($expected, $fullArticle);
    }

    public function test_extract_full_article_handles_nested_tags(): void
    {
        // Inline markup inside paragraphs is stripped, its text is kept.
        $html = '
            <html>
                <body>
                    <div class="article-body-commercial-selector">
                        <p>This has <strong>bold text</strong> and <em>italic text</em>.</p>
                        <p>This has <a href="#">a link</a> inside.</p>
                    </div>
                </body>
            </html>
        ';
        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);
        $expected = "This has bold text and italic text.\n\nThis has a link inside.";
        $this->assertSame($expected, $fullArticle);
    }

    public function test_extract_full_article_removes_scripts_and_styles(): void
    {
        $html = '
            <html>
                <head>
                    <script>console.log("test");</script>
                    <style>.test { color: red; }</style>
                </head>
                <body>
                    <div class="article-body-commercial-selector">
                        <p>Clean content.</p>
                    </div>
                    <script>alert("bad");</script>
                </body>
            </html>
        ';
        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);
        $this->assertSame('Clean content.', $fullArticle);
        $this->assertStringNotContainsString('console.log', $fullArticle);
        $this->assertStringNotContainsString('alert', $fullArticle);
    }

    public function test_extract_full_article_returns_null_when_no_content(): void
    {
        $html = '<html><body><div>No paragraphs here</div></body></html>';
        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);
        $this->assertNull($fullArticle);
    }

    public function test_extract_thumbnail_from_og_image(): void
    {
        $html = '<html><head><meta property="og:image" content="https://i.guim.co.uk/img/test.jpg"/></head></html>';
        $thumbnail = GuardianArticlePageParser::extractThumbnail($html);
        $this->assertSame('https://i.guim.co.uk/img/test.jpg', $thumbnail);
    }

    public function test_extract_thumbnail_from_img_tag(): void
    {
        $html = '<html><body><img src="https://i.guim.co.uk/img/article-image.png" alt="test"/></body></html>';
        $thumbnail = GuardianArticlePageParser::extractThumbnail($html);
        $this->assertSame('https://i.guim.co.uk/img/article-image.png', $thumbnail);
    }

    public function test_extract_thumbnail_returns_null_when_not_found(): void
    {
        $html = '<html><body><div>No images here</div></body></html>';
        $thumbnail = GuardianArticlePageParser::extractThumbnail($html);
        $this->assertNull($thumbnail);
    }

    public function test_extract_data_returns_all_components(): void
    {
        $html = '
            <html>
                <head>
                    <meta property="og:title" content="Guardian Test Article"/>
                    <meta property="og:description" content="Test description"/>
                    <meta property="og:image" content="https://i.guim.co.uk/img/image.jpg"/>
                </head>
                <body>
                    <div class="article-body-commercial-selector">
                        <p>Full article content here.</p>
                    </div>
                </body>
            </html>
        ';
        $data = GuardianArticlePageParser::extractData($html);
        $this->assertIsArray($data);
        $this->assertArrayHasKey('title', $data);
        $this->assertArrayHasKey('description', $data);
        $this->assertArrayHasKey('full_article', $data);
        $this->assertArrayHasKey('thumbnail', $data);
        $this->assertSame('Guardian Test Article', $data['title']);
        $this->assertSame('Test description', $data['description']);
        $this->assertSame('Full article content here.', $data['full_article']);
        $this->assertSame('https://i.guim.co.uk/img/image.jpg', $data['thumbnail']);
    }

    public function test_extract_data_handles_missing_components_gracefully(): void
    {
        $html = '<html><body><div>Minimal content</div></body></html>';
        $data = GuardianArticlePageParser::extractData($html);
        $this->assertIsArray($data);
        $this->assertNull($data['title']);
        $this->assertNull($data['description']);
        $this->assertNull($data['full_article']);
        $this->assertNull($data['thumbnail']);
    }

    public function test_extract_full_article_with_realistic_guardian_html(): void
    {
        $html = '
            <html>
                <body>
                    <div class="article-body-commercial-selector">
                        <p><strong>The prime minister has announced a new climate policy that aims to reduce carbon emissions by 50% by 2030.</strong></p>
                        <p>The announcement came during a press conference at Downing Street on Tuesday afternoon.</p>
                        <p>Environmental groups have cautiously welcomed the move, while industry leaders have expressed concern about the timeline.</p>
                    </div>
                </body>
            </html>
        ';
        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);
        $this->assertNotNull($fullArticle);
        $this->assertStringContainsString('climate policy', $fullArticle);
        $this->assertStringContainsString('press conference', $fullArticle);
        $this->assertStringContainsString('Environmental groups', $fullArticle);
        // Paragraphs are separated by a blank line and carry no markup.
        $this->assertStringContainsString("\n\n", $fullArticle);
        $this->assertStringNotContainsString('<strong>', $fullArticle);
    }
 }
--- a/tests/Unit/Services/Parsers/GuardianArticleParserTest.php
+++ b/tests/Unit/Services/Parsers/GuardianArticleParserTest.php
@ -0,0 +1,63 @@
 <?php
 namespace Tests\Unit\Services\Parsers;
 use App\Contracts\ArticleParserInterface;
 use App\Services\Parsers\GuardianArticleParser;
 use Tests\TestCase;
 /**
  * Unit tests for GuardianArticleParser: URL matching, source name and
  * delegation of data extraction.
  */
 class GuardianArticleParserTest extends TestCase
 {
    /** Fresh parser instance for every test. */
    private GuardianArticleParser $parser;

    protected function setUp(): void
    {
        parent::setUp();
        $this->parser = new GuardianArticleParser();
    }

    public function test_implements_article_parser_interface(): void
    {
        $this->assertInstanceOf(ArticleParserInterface::class, $this->parser);
    }

    public function test_can_parse_guardian_url(): void
    {
        $guardianUrl = 'https://www.theguardian.com/world/2026/mar/08/some-article';

        $this->assertTrue($this->parser->canParse($guardianUrl));
    }

    public function test_can_parse_guardian_url_without_www(): void
    {
        $guardianUrl = 'https://theguardian.com/world/2026/mar/08/some-article';

        $this->assertTrue($this->parser->canParse($guardianUrl));
    }

    public function test_cannot_parse_non_guardian_url(): void
    {
        // URLs of the other two providers must be rejected.
        $foreignUrls = [
            'https://www.vrt.be/vrtnws/en/article',
            'https://www.belganewsagency.eu/article',
        ];

        foreach ($foreignUrls as $foreignUrl) {
            $this->assertFalse($this->parser->canParse($foreignUrl));
        }
    }

    public function test_get_source_name(): void
    {
        $sourceName = $this->parser->getSourceName();

        $this->assertEquals('The Guardian', $sourceName);
    }

    public function test_extract_data_delegates_to_page_parser(): void
    {
        $html = '
            <html>
                <head>
                    <meta property="og:title" content="Test Title"/>
                    <meta property="og:description" content="Test Description"/>
                </head>
                <body><p>Content</p></body>
            </html>
        ';

        $extracted = $this->parser->extractData($html);

        // Only the title is checked in detail; the rest is covered by the
        // page parser's own tests.
        $this->assertIsArray($extracted);
        $this->assertArrayHasKey('title', $extracted);
        $this->assertEquals('Test Title', $extracted['title']);
    }
 }