From 1e39a25f83c9123fde4ade54f12af6ecb22d1952 Mon Sep 17 00:00:00 2001
From: myrmidex <myrmidex@myrmidex.net>
Date: Sun, 8 Mar 2026 11:02:46 +0100
Subject: [PATCH] 37 - Add The Guardian as RSS feed provider, implement RSS
 parsing

---
 app/Http/Requests/StoreFeedRequest.php        |   6 +-
 app/Services/Article/ArticleFetcher.php       |  42 ++-
 .../Factories/ArticleParserFactory.php        |   2 +
 .../Parsers/GuardianArticlePageParser.php     | 110 +++++++
 .../Parsers/GuardianArticleParser.php         |  23 ++
 config/feed.php                               |  16 +-
 .../Api/V1/FeedsControllerTest.php            |  34 ++-
 tests/Unit/Actions/CreateFeedActionTest.php   |  14 +-
 tests/Unit/Services/ArticleFetcherRssTest.php | 164 ++++++++++
 .../Factories/ArticleParserFactoryTest.php    |   5 +-
 .../Parsers/GuardianArticlePageParserTest.php | 285 ++++++++++++++++++
 .../Parsers/GuardianArticleParserTest.php     |  63 ++++
 12 files changed, 754 insertions(+), 10 deletions(-)
 create mode 100644 app/Services/Parsers/GuardianArticlePageParser.php
 create mode 100644 app/Services/Parsers/GuardianArticleParser.php
 create mode 100644 tests/Unit/Services/ArticleFetcherRssTest.php
 create mode 100644 tests/Unit/Services/Parsers/GuardianArticlePageParserTest.php
 create mode 100644 tests/Unit/Services/Parsers/GuardianArticleParserTest.php

diff --git a/app/Http/Requests/StoreFeedRequest.php b/app/Http/Requests/StoreFeedRequest.php
index a49570c..ac2e533 100644
--- a/app/Http/Requests/StoreFeedRequest.php
+++ b/app/Http/Requests/StoreFeedRequest.php
@@ -12,13 +12,15 @@ public function authorize(): bool
     }
 
     /**
-     * @return array<string, string>
+     * @return array<string, mixed>
      */
     public function rules(): array
     {
+        $providers = implode(',', array_keys(config('feed.providers', [])));
+
         return [
             'name' => 'required|string|max:255',
-            'provider' => 'required|in:vrt,belga',
+            'provider' => "required|in:{$providers}",
             'language_id' => 'required|exists:languages,id',
             'description' => 'nullable|string',
             'is_active' => 'boolean'
diff --git a/app/Services/Article/ArticleFetcher.php b/app/Services/Article/ArticleFetcher.php
index 44124c4..6669a7c 100644
--- a/app/Services/Article/ArticleFetcher.php
+++ b/app/Services/Article/ArticleFetcher.php
@@ -41,9 +41,50 @@ public function getArticlesFromFeed(Feed $feed): Collection
      */
     private function getArticlesFromRssFeed(Feed $feed): Collection
     {
-        // TODO: Implement RSS feed parsing
-        // For now, return empty collection
-        return collect();
+        try {
+            $xml = HttpFetcher::fetchHtml($feed->url);
+
+            // Buffer libxml errors so malformed XML does not raise PHP warnings;
+            // the previous setting is restored in the finally block.
+            $previousUseErrors = libxml_use_internal_errors(true);
+
+            try {
+                // Feed XML is untrusted input: LIBXML_NONET forbids network access while parsing.
+                $rss = simplexml_load_string($xml, \SimpleXMLElement::class, LIBXML_NONET);
+            } finally {
+                libxml_clear_errors();
+                libxml_use_internal_errors($previousUseErrors);
+            }
+
+            if ($rss === false || !isset($rss->channel->item)) {
+                $this->logSaver->warning("Failed to parse RSS feed XML", null, [
+                    'feed_id' => $feed->id,
+                    'feed_url' => $feed->url,
+                ]);
+
+                return collect();
+            }
+
+            $articles = collect();
+            foreach ($rss->channel->item as $item) {
+                // A <link> element may carry surrounding whitespace or newlines; trim it so the
+                // stored URL is clean and a whitespace-only link is skipped.
+                $link = trim((string) $item->link);
+                if ($link !== '') {
+                    $articles->push($this->saveArticle($link, $feed->id));
+                }
+            }
+
+            return $articles;
+        } catch (Exception $e) {
+            $this->logSaver->error("Failed to fetch articles from RSS feed", null, [
+                'feed_id' => $feed->id,
+                'feed_url' => $feed->url,
+                'error' => $e->getMessage(),
+            ]);
+
+            return collect();
+        }
     }
 
     /**
diff --git a/app/Services/Factories/ArticleParserFactory.php b/app/Services/Factories/ArticleParserFactory.php
index 765994a..cfef7b3 100644
--- a/app/Services/Factories/ArticleParserFactory.php
+++ b/app/Services/Factories/ArticleParserFactory.php
@@ -6,6 +6,7 @@
 use App\Models\Feed;
 use App\Services\Parsers\VrtArticleParser;
 use App\Services\Parsers\BelgaArticleParser;
+use App\Services\Parsers\GuardianArticleParser;
 use Exception;
 
 class ArticleParserFactory
@@ -16,6 +17,7 @@ class ArticleParserFactory
     private static array $parsers = [
         VrtArticleParser::class,
         BelgaArticleParser::class,
+        GuardianArticleParser::class,
     ];
 
     /**
diff --git a/app/Services/Parsers/GuardianArticlePageParser.php b/app/Services/Parsers/GuardianArticlePageParser.php
new file mode 100644
index 0000000..7f94570
--- /dev/null
+++ b/app/Services/Parsers/GuardianArticlePageParser.php
@@ -0,0 +1,165 @@
+<?php
+
+namespace App\Services\Parsers;
+
+/**
+ * Regex-based extraction of title, description, body text and thumbnail from the raw
+ * HTML of a Guardian article page.
+ *
+ * Each extractor returns null when none of its patterns match.
+ */
+class GuardianArticlePageParser
+{
+    /**
+     * Extract the article title.
+     *
+     * Sources are tried in order: og:title meta tag, first text-only <h1>, <title> tag.
+     *
+     * @param string $html Raw page HTML.
+     * @return string|null Entity-decoded title, or null when no source matches.
+     */
+    public static function extractTitle(string $html): ?string
+    {
+        if (preg_match('/<meta property="og:title" content="([^"]+)"/i', $html, $matches)) {
+            return self::decodeEntities($matches[1]);
+        }
+
+        // The capture group excludes "<", so only an <h1> without nested markup matches.
+        if (preg_match('/<h1[^>]*>([^<]+)<\/h1>/i', $html, $matches)) {
+            return self::decodeEntities(strip_tags($matches[1]));
+        }
+
+        if (preg_match('/<title>([^<]+)<\/title>/i', $html, $matches)) {
+            return self::decodeEntities($matches[1]);
+        }
+
+        return null;
+    }
+
+    /**
+     * Extract a short description.
+     *
+     * Uses the og:description meta tag when present, otherwise the first <p> matched by the
+     * fallback pattern, with its tags stripped.
+     *
+     * @param string $html Raw page HTML.
+     * @return string|null Entity-decoded description, or null when nothing matches.
+     */
+    public static function extractDescription(string $html): ?string
+    {
+        if (preg_match('/<meta property="og:description" content="([^"]+)"/i', $html, $matches)) {
+            return self::decodeEntities($matches[1]);
+        }
+
+        if (preg_match('/<p[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i', $html, $matches)) {
+            return self::decodeEntities(strip_tags($matches[1]));
+        }
+
+        return null;
+    }
+
+    /**
+     * Extract the article body as plain text, paragraphs separated by a blank line.
+     *
+     * <script> and <style> elements are removed first. Paragraphs are taken from the
+     * "article-body-commercial-selector" container when it contains <p> elements, otherwise
+     * from every <p> in the page.
+     *
+     * @param string $html Raw page HTML.
+     * @return string|null Joined paragraph text, or null when no non-blank paragraph is found.
+     */
+    public static function extractFullArticle(string $html): ?string
+    {
+        // preg_replace() returns null on a PCRE failure (for example when the backtrack limit
+        // is hit on a very large page); keep the unstripped markup in that case rather than
+        // handing null to the matchers below.
+        $cleanHtml = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi', '', $html) ?? $html;
+        $cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi', '', $cleanHtml) ?? $cleanHtml;
+
+        // Greedy on purpose so the match does not stop at the first nested </div>. The trade-off
+        // is that the capture runs to the last </div> in the document.
+        if (preg_match('/<div[^>]*class="[^"]*article-body-commercial-selector[^"]*"[^>]*>(.*)<\/div>/is', $cleanHtml, $sectionMatches)) {
+            preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $sectionMatches[1], $matches);
+
+            if (!empty($matches[1])) {
+                return self::joinParagraphs($matches[1]);
+            }
+        }
+
+        // Fallback: every paragraph in the page.
+        preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches);
+        if (!empty($matches[1])) {
+            return self::joinParagraphs($matches[1]);
+        }
+
+        return null;
+    }
+
+    /**
+     * Extract a thumbnail URL from the og:image meta tag, or else the first <img src>.
+     *
+     * Attribute values are HTML-escaped in the markup (query separators appear as "&amp;"),
+     * so the captured value is entity-decoded before it is returned.
+     *
+     * @param string $html Raw page HTML.
+     * @return string|null Decoded image URL, or null when no image is found.
+     */
+    public static function extractThumbnail(string $html): ?string
+    {
+        if (preg_match('/<meta property="og:image" content="([^"]+)"/i', $html, $matches)) {
+            return self::decodeEntities($matches[1]);
+        }
+
+        if (preg_match('/<img[^>]+src="([^"]+)"/i', $html, $matches)) {
+            return self::decodeEntities($matches[1]);
+        }
+
+        return null;
+    }
+
+    /**
+     * Run every extractor over the same HTML.
+     *
+     * @return array<string, string|null> Keys: title, description, full_article, thumbnail.
+     */
+    public static function extractData(string $html): array
+    {
+        return [
+            'title' => self::extractTitle($html),
+            'description' => self::extractDescription($html),
+            'full_article' => self::extractFullArticle($html),
+            'thumbnail' => self::extractThumbnail($html),
+        ];
+    }
+
+    /**
+     * Strip tags and decode entities in each paragraph, drop paragraphs that are blank
+     * after trimming, and join the rest with a blank line.
+     *
+     * @param array<int, string> $paragraphs Inner HTML of each <p>.
+     * @return string|null Joined text, or null when every paragraph is blank.
+     */
+    private static function joinParagraphs(array $paragraphs): ?string
+    {
+        $texts = array_map(
+            static fn (string $paragraph): string => self::decodeEntities(strip_tags($paragraph)),
+            $paragraphs
+        );
+
+        $fullText = implode("\n\n", array_filter(
+            $texts,
+            static fn (string $text): bool => trim($text) !== ''
+        ));
+
+        // Compare against '' explicitly: a truthiness check would also discard the text "0".
+        return $fullText !== '' ? $fullText : null;
+    }
+
+    /**
+     * Decode HTML entities (including both quote styles) as UTF-8.
+     */
+    private static function decodeEntities(string $text): string
+    {
+        return html_entity_decode($text, ENT_QUOTES, 'UTF-8');
+    }
+}
\ No newline at end of file
diff --git a/app/Services/Parsers/GuardianArticleParser.php b/app/Services/Parsers/GuardianArticleParser.php
new file mode 100644
index 0000000..a363199
--- /dev/null
+++ b/app/Services/Parsers/GuardianArticleParser.php
@@ -0,0 +1,51 @@
+<?php
+
+namespace App\Services\Parsers;
+
+use App\Contracts\ArticleParserInterface;
+
+/**
+ * Article parser for The Guardian; extraction is delegated to GuardianArticlePageParser.
+ */
+class GuardianArticleParser implements ArticleParserInterface
+{
+    /**
+     * Whether the URL is served from theguardian.com or one of its subdomains.
+     *
+     * The decision is made on the URL's host, so an unrelated URL that merely mentions
+     * "theguardian.com" in its path or query string is rejected.
+     */
+    public function canParse(string $url): bool
+    {
+        $host = parse_url($url, PHP_URL_HOST);
+
+        if (!is_string($host)) {
+            // No scheme (e.g. "www.theguardian.com/world"): re-parse as a network-path reference.
+            $host = parse_url('//' . ltrim($url, '/'), PHP_URL_HOST);
+        }
+
+        if (!is_string($host)) {
+            return false;
+        }
+
+        $host = strtolower($host);
+
+        return $host === 'theguardian.com' || str_ends_with($host, '.theguardian.com');
+    }
+
+    /**
+     * @return array<string, string|null> Keys: title, description, full_article, thumbnail.
+     */
+    public function extractData(string $html): array
+    {
+        return GuardianArticlePageParser::extractData($html);
+    }
+
+    /**
+     * Display name of the source this parser handles.
+     */
+    public function getSourceName(): string
+    {
+        return 'The Guardian';
+    }
+}
\ No newline at end of file
diff --git a/config/feed.php b/config/feed.php
index b6bdfd7..4ddb9ea 100644
--- a/config/feed.php
+++ b/config/feed.php
@@ -33,7 +33,7 @@
             'code' => 'belga',
             'name' => 'Belga News Agency',
             'description' => 'Belgian national news agency',
-            'type' => 'rss',
+            'type' => 'website',
             'is_active' => true,
             'languages' => [
                 'en' => ['url' => 'https://www.belganewsagency.eu/'],
@@ -44,6 +44,20 @@
                 'article_page' => \App\Services\Parsers\BelgaArticlePageParser::class,
             ],
         ],
+        'guardian' => [
+            'code' => 'guardian',
+            'name' => 'The Guardian',
+            'description' => 'British daily newspaper',
+            'type' => 'rss',
+            'is_active' => true,
+            'languages' => [
+                'en' => ['url' => 'https://www.theguardian.com/international/rss'],
+            ],
+            'parsers' => [
+                'article' => \App\Services\Parsers\GuardianArticleParser::class,
+                'article_page' => \App\Services\Parsers\GuardianArticlePageParser::class,
+            ],
+        ],
     ],
 
     /*
diff --git a/tests/Feature/Http/Controllers/Api/V1/FeedsControllerTest.php b/tests/Feature/Http/Controllers/Api/V1/FeedsControllerTest.php
index 25c2695..143a8f5 100644
--- a/tests/Feature/Http/Controllers/Api/V1/FeedsControllerTest.php
+++ b/tests/Feature/Http/Controllers/Api/V1/FeedsControllerTest.php
@@ -99,7 +99,7 @@ public function test_store_creates_belga_feed_successfully(): void
                 'data' => [
                     'name' => 'Belga Test Feed',
                     'url' => 'https://www.belganewsagency.eu/',
-                    'type' => 'rss',
+                    'type' => 'website',
                     'is_active' => true,
                 ]
             ]);
@@ -107,6 +107,38 @@ public function test_store_creates_belga_feed_successfully(): void
         $this->assertDatabaseHas('feeds', [
             'name' => 'Belga Test Feed',
             'url' => 'https://www.belganewsagency.eu/',
+            'type' => 'website',
+        ]);
+    }
+
+    public function test_store_creates_guardian_feed_successfully(): void
+    {
+        // The request carries no URL or type; the assertions expect the Guardian RSS values.
+        $english = Language::factory()->english()->create();
+        $payload = [
+            'name' => 'Guardian Test Feed',
+            'provider' => 'guardian',
+            'language_id' => $english->id,
+            'is_active' => true,
+        ];
+
+        $this->postJson('/api/v1/feeds', $payload)
+            ->assertStatus(201)
+            ->assertJson([
+                'success' => true,
+                'message' => 'Feed created successfully!',
+                'data' => [
+                    'name' => 'Guardian Test Feed',
+                    'url' => 'https://www.theguardian.com/international/rss',
+                    'type' => 'rss',
+                    'is_active' => true,
+                ]
+            ]);
+
+        // The same values must also be present in the feeds table.
+        $this->assertDatabaseHas('feeds', [
+            'name' => 'Guardian Test Feed',
+            'url' => 'https://www.theguardian.com/international/rss',
             'type' => 'rss',
         ]);
     }
diff --git a/tests/Unit/Actions/CreateFeedActionTest.php b/tests/Unit/Actions/CreateFeedActionTest.php
index 072bdab..5c4eec6 100644
--- a/tests/Unit/Actions/CreateFeedActionTest.php
+++ b/tests/Unit/Actions/CreateFeedActionTest.php
@@ -43,11 +43,23 @@ public function test_creates_belga_feed_with_correct_url(): void
         $feed = $this->action->execute('Belga News', 'belga', $language->id);
 
         $this->assertEquals('https://www.belganewsagency.eu/', $feed->url);
-        $this->assertEquals('rss', $feed->type);
+        $this->assertEquals('website', $feed->type);
         $this->assertEquals('belga', $feed->provider);
         $this->assertNull($feed->description);
     }
 
+    public function test_creates_guardian_feed_with_correct_url(): void
+    {
+        // Fixture: an active English language for the action to attach the feed to.
+        $english = Language::factory()->create(['short_code' => 'en', 'is_active' => true]);
+        $created = $this->action->execute('Guardian News', 'guardian', $english->id);
+
+        $this->assertEquals('https://www.theguardian.com/international/rss', $created->url);
+        $this->assertEquals('rss', $created->type);
+        $this->assertEquals('guardian', $created->provider);
+        $this->assertNull($created->description);
+    }
+
     public function test_creates_vrt_feed_with_dutch_language(): void
     {
         $language = Language::factory()->create(['short_code' => 'nl', 'is_active' => true]);
diff --git a/tests/Unit/Services/ArticleFetcherRssTest.php b/tests/Unit/Services/ArticleFetcherRssTest.php
new file mode 100644
index 0000000..0479d6a
--- /dev/null
+++ b/tests/Unit/Services/ArticleFetcherRssTest.php
@@ -0,0 +1,149 @@
+<?php
+
+namespace Tests\Unit\Services;
+
+use App\Models\Article;
+use App\Models\Feed;
+use Illuminate\Foundation\Testing\RefreshDatabase;
+use Illuminate\Support\Facades\Http;
+use Mockery;
+use Tests\TestCase;
+use Tests\Traits\CreatesArticleFetcher;
+
+/**
+ * Covers ArticleFetcher::getArticlesFromFeed() for feeds of type "rss", with HTTP faked.
+ */
+class ArticleFetcherRssTest extends TestCase
+{
+    use RefreshDatabase, CreatesArticleFetcher;
+
+    private const FIRST_ARTICLE_URL = 'https://www.theguardian.com/world/2026/mar/08/first-article';
+    private const SECOND_ARTICLE_URL = 'https://www.theguardian.com/world/2026/mar/08/second-article';
+
+    /** Two-item RSS 2.0 document served by the faked HTTP client. */
+    private string $sampleRss;
+
+    protected function setUp(): void
+    {
+        parent::setUp();
+
+        $this->sampleRss = <<<'XML'
+<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+    <channel>
+        <title>The Guardian - International</title>
+        <link>https://www.theguardian.com/international</link>
+        <item>
+            <title>First Article Title</title>
+            <link>https://www.theguardian.com/world/2026/mar/08/first-article</link>
+            <description>First article description</description>
+            <pubDate>Sun, 08 Mar 2026 12:00:00 GMT</pubDate>
+        </item>
+        <item>
+            <title>Second Article Title</title>
+            <link>https://www.theguardian.com/world/2026/mar/08/second-article</link>
+            <description>Second article description</description>
+            <pubDate>Sun, 08 Mar 2026 11:00:00 GMT</pubDate>
+        </item>
+    </channel>
+</rss>
+XML;
+    }
+
+    protected function tearDown(): void
+    {
+        Mockery::close();
+        parent::tearDown();
+    }
+
+    public function test_get_articles_from_rss_feed_returns_collection(): void
+    {
+        $this->fakeFeedResponse($this->sampleRss);
+
+        $articles = $this->fetchArticles($this->createGuardianFeed());
+
+        $this->assertInstanceOf(\Illuminate\Support\Collection::class, $articles);
+    }
+
+    public function test_get_articles_from_rss_feed_creates_articles(): void
+    {
+        $this->fakeFeedResponse($this->sampleRss);
+        $feed = $this->createGuardianFeed();
+
+        $articles = $this->fetchArticles($feed);
+
+        $this->assertCount(2, $articles);
+        foreach ([self::FIRST_ARTICLE_URL, self::SECOND_ARTICLE_URL] as $url) {
+            $this->assertDatabaseHas('articles', ['url' => $url, 'feed_id' => $feed->id]);
+        }
+    }
+
+    public function test_get_articles_from_rss_feed_does_not_duplicate_existing(): void
+    {
+        $this->fakeFeedResponse($this->sampleRss);
+        $feed = $this->createGuardianFeed();
+        Article::factory()->create(['url' => self::FIRST_ARTICLE_URL, 'feed_id' => $feed->id]);
+
+        $articles = $this->fetchArticles($feed);
+
+        $this->assertCount(2, $articles);
+        $this->assertEquals(1, Article::where('url', self::FIRST_ARTICLE_URL)->count());
+    }
+
+    public function test_get_articles_from_rss_feed_handles_invalid_xml(): void
+    {
+        $this->fakeFeedResponse('this is not xml');
+
+        $articles = $this->fetchArticles($this->createGuardianFeed());
+
+        $this->assertInstanceOf(\Illuminate\Support\Collection::class, $articles);
+        $this->assertEmpty($articles);
+    }
+
+    public function test_get_articles_from_rss_feed_handles_empty_channel(): void
+    {
+        $emptyChannel = '<?xml version="1.0"?><rss><channel><title>Empty</title></channel></rss>';
+        $this->fakeFeedResponse($emptyChannel);
+
+        $articles = $this->fetchArticles($this->createGuardianFeed());
+
+        $this->assertEmpty($articles);
+    }
+
+    public function test_get_articles_from_rss_feed_handles_http_failure(): void
+    {
+        $this->fakeFeedResponse('Server Error', 500);
+
+        $articles = $this->fetchArticles($this->createGuardianFeed());
+
+        $this->assertEmpty($articles);
+    }
+
+    /**
+     * Make every outgoing HTTP request return the given body and status.
+     */
+    private function fakeFeedResponse(string $body, int $status = 200): void
+    {
+        Http::fake(['*' => Http::response($body, $status)]);
+    }
+
+    /**
+     * Persist a Guardian feed of type "rss" pointing at the international RSS URL.
+     */
+    private function createGuardianFeed(): Feed
+    {
+        return Feed::factory()->create([
+            'type' => 'rss',
+            'provider' => 'guardian',
+            'url' => 'https://www.theguardian.com/international/rss',
+        ]);
+    }
+
+    /**
+     * Run a freshly created ArticleFetcher against the feed and return its result.
+     */
+    private function fetchArticles(Feed $feed): \Illuminate\Support\Collection
+    {
+        return $this->createArticleFetcher()->getArticlesFromFeed($feed);
+    }
+}
\ No newline at end of file
diff --git a/tests/Unit/Services/Factories/ArticleParserFactoryTest.php b/tests/Unit/Services/Factories/ArticleParserFactoryTest.php
index a0a9808..ef5a24a 100644
--- a/tests/Unit/Services/Factories/ArticleParserFactoryTest.php
+++ b/tests/Unit/Services/Factories/ArticleParserFactoryTest.php
@@ -46,9 +46,10 @@ public function test_get_supported_sources_returns_array_of_source_names(): void
         $sources = ArticleParserFactory::getSupportedSources();
 
         $this->assertIsArray($sources);
-        $this->assertCount(2, $sources);
+        $this->assertCount(3, $sources);
         $this->assertContains('VRT News', $sources);
         $this->assertContains('Belga News Agency', $sources);
+        $this->assertContains('The Guardian', $sources);
     }
 
     public function test_get_supported_sources_returns_sources_in_correct_order(): void
@@ -88,7 +89,7 @@ public function getSourceName(): string
         // Verify it's now included in supported sources
         $sources = ArticleParserFactory::getSupportedSources();
         $this->assertContains('TestParser', $sources);
-        $this->assertCount(3, $sources); // Original 2 + 1 new
+        $this->assertCount(4, $sources); // Original 3 + 1 new
 
         // Verify it can be used to parse URLs
         $testUrl = 'https://test-parser.com/article';
diff --git a/tests/Unit/Services/Parsers/GuardianArticlePageParserTest.php b/tests/Unit/Services/Parsers/GuardianArticlePageParserTest.php
new file mode 100644
index 0000000..a0126d0
--- /dev/null
+++ b/tests/Unit/Services/Parsers/GuardianArticlePageParserTest.php
@@ -0,0 +1,284 @@
+<?php
+
+namespace Tests\Unit\Services\Parsers;
+
+use App\Services\Parsers\GuardianArticlePageParser;
+use Tests\TestCase;
+
+/**
+ * Unit tests for the static extractors of GuardianArticlePageParser.
+ */
+class GuardianArticlePageParserTest extends TestCase
+{
+    public function test_extract_title_from_og_meta_tag(): void
+    {
+        $markup = '<html><head><meta property="og:title" content="Guardian Article Title"/></head><body></body></html>';
+
+        $extractedTitle = GuardianArticlePageParser::extractTitle($markup);
+
+        $this->assertEquals('Guardian Article Title', $extractedTitle);
+    }
+
+    public function test_extract_title_from_h1_tag(): void
+    {
+        $markup = '<html><body><h1>H1 Title Test</h1></body></html>';
+
+        $extractedTitle = GuardianArticlePageParser::extractTitle($markup);
+
+        $this->assertEquals('H1 Title Test', $extractedTitle);
+    }
+
+    public function test_extract_title_from_title_tag(): void
+    {
+        $markup = '<html><head><title>Page Title Test</title></head><body></body></html>';
+
+        $extractedTitle = GuardianArticlePageParser::extractTitle($markup);
+
+        $this->assertEquals('Page Title Test', $extractedTitle);
+    }
+
+    public function test_extract_title_with_html_entities(): void
+    {
+        $markup = '<html><head><meta property="og:title" content="Test &amp; Article &quot;Title&quot;"/></head></html>';
+
+        $extractedTitle = GuardianArticlePageParser::extractTitle($markup);
+
+        $this->assertEquals('Test & Article "Title"', $extractedTitle);
+    }
+
+    public function test_extract_title_returns_null_when_not_found(): void
+    {
+        $markup = '<html><body><p>No title here</p></body></html>';
+
+        $extractedTitle = GuardianArticlePageParser::extractTitle($markup);
+
+        $this->assertNull($extractedTitle);
+    }
+
+    public function test_extract_description_from_og_meta_tag(): void
+    {
+        $markup = '<html><head><meta property="og:description" content="Guardian article description"/></head></html>';
+
+        $extractedDescription = GuardianArticlePageParser::extractDescription($markup);
+
+        $this->assertEquals('Guardian article description', $extractedDescription);
+    }
+
+    public function test_extract_description_from_paragraph(): void
+    {
+        $markup = '<html><body><p>This is the first paragraph description.</p></body></html>';
+
+        $extractedDescription = GuardianArticlePageParser::extractDescription($markup);
+
+        $this->assertEquals('This is the first paragraph description.', $extractedDescription);
+    }
+
+    public function test_extract_description_returns_null_when_not_found(): void
+    {
+        $markup = '<html><body><div>No description here</div></body></html>';
+
+        $extractedDescription = GuardianArticlePageParser::extractDescription($markup);
+
+        $this->assertNull($extractedDescription);
+    }
+
+    public function test_extract_full_article_from_guardian_article_body(): void
+    {
+        $markup = '
+            <html>
+                <body>
+                    <div class="article-body-commercial-selector">
+                        <p>First paragraph of the article.</p>
+                        <p>Second paragraph of the article.</p>
+                    </div>
+                </body>
+            </html>
+        ';
+
+        $articleText = GuardianArticlePageParser::extractFullArticle($markup);
+
+        $this->assertEquals("First paragraph of the article.\n\nSecond paragraph of the article.", $articleText);
+    }
+
+    public function test_extract_full_article_fallback_to_all_paragraphs(): void
+    {
+        $markup = '
+            <html>
+                <body>
+                    <p>First general paragraph.</p>
+                    <p>Second general paragraph.</p>
+                </body>
+            </html>
+        ';
+
+        $articleText = GuardianArticlePageParser::extractFullArticle($markup);
+
+        $this->assertEquals("First general paragraph.\n\nSecond general paragraph.", $articleText);
+    }
+
+    public function test_extract_full_article_filters_empty_paragraphs(): void
+    {
+        $markup = '
+            <html>
+                <body>
+                    <div class="article-body-commercial-selector">
+                        <p>Content paragraph.</p>
+                        <p>   </p>
+                        <p></p>
+                        <p>Another content paragraph.</p>
+                    </div>
+                </body>
+            </html>
+        ';
+
+        $articleText = GuardianArticlePageParser::extractFullArticle($markup);
+
+        $this->assertEquals("Content paragraph.\n\nAnother content paragraph.", $articleText);
+    }
+
+    public function test_extract_full_article_handles_nested_tags(): void
+    {
+        $markup = '
+            <html>
+                <body>
+                    <div class="article-body-commercial-selector">
+                        <p>This has <strong>bold text</strong> and <em>italic text</em>.</p>
+                        <p>This has <a href="#">a link</a> inside.</p>
+                    </div>
+                </body>
+            </html>
+        ';
+
+        $articleText = GuardianArticlePageParser::extractFullArticle($markup);
+
+        $this->assertEquals("This has bold text and italic text.\n\nThis has a link inside.", $articleText);
+    }
+
+    public function test_extract_full_article_removes_scripts_and_styles(): void
+    {
+        $markup = '
+            <html>
+                <head>
+                    <script>console.log("test");</script>
+                    <style>.test { color: red; }</style>
+                </head>
+                <body>
+                    <div class="article-body-commercial-selector">
+                        <p>Clean content.</p>
+                    </div>
+                    <script>alert("bad");</script>
+                </body>
+            </html>
+        ';
+
+        $articleText = GuardianArticlePageParser::extractFullArticle($markup);
+
+        $this->assertEquals('Clean content.', $articleText);
+        $this->assertStringNotContainsString('console.log', $articleText);
+        $this->assertStringNotContainsString('alert', $articleText);
+    }
+
+    public function test_extract_full_article_returns_null_when_no_content(): void
+    {
+        $markup = '<html><body><div>No paragraphs here</div></body></html>';
+
+        $articleText = GuardianArticlePageParser::extractFullArticle($markup);
+
+        $this->assertNull($articleText);
+    }
+
+    public function test_extract_thumbnail_from_og_image(): void
+    {
+        $markup = '<html><head><meta property="og:image" content="https://i.guim.co.uk/img/test.jpg"/></head></html>';
+
+        $imageUrl = GuardianArticlePageParser::extractThumbnail($markup);
+
+        $this->assertEquals('https://i.guim.co.uk/img/test.jpg', $imageUrl);
+    }
+
+    public function test_extract_thumbnail_from_img_tag(): void
+    {
+        $markup = '<html><body><img src="https://i.guim.co.uk/img/article-image.png" alt="test"/></body></html>';
+
+        $imageUrl = GuardianArticlePageParser::extractThumbnail($markup);
+
+        $this->assertEquals('https://i.guim.co.uk/img/article-image.png', $imageUrl);
+    }
+
+    public function test_extract_thumbnail_returns_null_when_not_found(): void
+    {
+        $markup = '<html><body><div>No images here</div></body></html>';
+
+        $imageUrl = GuardianArticlePageParser::extractThumbnail($markup);
+
+        $this->assertNull($imageUrl);
+    }
+
+    public function test_extract_data_returns_all_components(): void
+    {
+        $markup = '
+            <html>
+                <head>
+                    <meta property="og:title" content="Guardian Test Article"/>
+                    <meta property="og:description" content="Test description"/>
+                    <meta property="og:image" content="https://i.guim.co.uk/img/image.jpg"/>
+                </head>
+                <body>
+                    <div class="article-body-commercial-selector">
+                        <p>Full article content here.</p>
+                    </div>
+                </body>
+            </html>
+        ';
+
+        $extracted = GuardianArticlePageParser::extractData($markup);
+
+        $this->assertIsArray($extracted);
+        $this->assertArrayHasKey('title', $extracted);
+        $this->assertArrayHasKey('description', $extracted);
+        $this->assertArrayHasKey('full_article', $extracted);
+        $this->assertArrayHasKey('thumbnail', $extracted);
+
+        $this->assertEquals('Guardian Test Article', $extracted['title']);
+        $this->assertEquals('Test description', $extracted['description']);
+        $this->assertEquals('Full article content here.', $extracted['full_article']);
+        $this->assertEquals('https://i.guim.co.uk/img/image.jpg', $extracted['thumbnail']);
+    }
+
+    public function test_extract_data_handles_missing_components_gracefully(): void
+    {
+        $markup = '<html><body><div>Minimal content</div></body></html>';
+
+        $extracted = GuardianArticlePageParser::extractData($markup);
+
+        $this->assertIsArray($extracted);
+        $this->assertNull($extracted['title']);
+        $this->assertNull($extracted['description']);
+        $this->assertNull($extracted['full_article']);
+        $this->assertNull($extracted['thumbnail']);
+    }
+
+    public function test_extract_full_article_with_realistic_guardian_html(): void
+    {
+        $markup = '
+            <html>
+                <body>
+                    <div class="article-body-commercial-selector">
+                        <p><strong>The prime minister has announced a new climate policy that aims to reduce carbon emissions by 50% by 2030.</strong></p>
+                        <p>The announcement came during a press conference at Downing Street on Tuesday afternoon.</p>
+                        <p>Environmental groups have cautiously welcomed the move, while industry leaders have expressed concern about the timeline.</p>
+                    </div>
+                </body>
+            </html>
+        ';
+
+        $articleText = GuardianArticlePageParser::extractFullArticle($markup);
+
+        $this->assertNotNull($articleText);
+        $this->assertStringContainsString('climate policy', $articleText);
+        $this->assertStringContainsString('press conference', $articleText);
+        $this->assertStringContainsString('Environmental groups', $articleText);
+        $this->assertStringContainsString("\n\n", $articleText);
+        $this->assertStringNotContainsString('<strong>', $articleText);
+    }
+}
\ No newline at end of file
diff --git a/tests/Unit/Services/Parsers/GuardianArticleParserTest.php b/tests/Unit/Services/Parsers/GuardianArticleParserTest.php
new file mode 100644
index 0000000..51bcd43
--- /dev/null
+++ b/tests/Unit/Services/Parsers/GuardianArticleParserTest.php
@@ -0,0 +1,76 @@
+<?php
+
+namespace Tests\Unit\Services\Parsers;
+
+use App\Contracts\ArticleParserInterface;
+use App\Services\Parsers\GuardianArticleParser;
+use Tests\TestCase;
+
+/**
+ * Unit tests for GuardianArticleParser: URL matching, source name and data extraction.
+ */
+class GuardianArticleParserTest extends TestCase
+{
+    private GuardianArticleParser $parser;
+
+    protected function setUp(): void
+    {
+        parent::setUp();
+        $this->parser = new GuardianArticleParser();
+    }
+
+    public function test_implements_article_parser_interface(): void
+    {
+        $this->assertInstanceOf(ArticleParserInterface::class, $this->parser);
+    }
+
+    public function test_can_parse_guardian_url(): void
+    {
+        $guardianUrl = 'https://www.theguardian.com/world/2026/mar/08/some-article';
+
+        $this->assertTrue($this->parser->canParse($guardianUrl));
+    }
+
+    public function test_can_parse_guardian_url_without_www(): void
+    {
+        $bareHostUrl = 'https://theguardian.com/world/2026/mar/08/some-article';
+
+        $this->assertTrue($this->parser->canParse($bareHostUrl));
+    }
+
+    public function test_cannot_parse_non_guardian_url(): void
+    {
+        $foreignUrls = [
+            'https://www.vrt.be/vrtnws/en/article',
+            'https://www.belganewsagency.eu/article',
+        ];
+
+        foreach ($foreignUrls as $foreignUrl) {
+            $this->assertFalse($this->parser->canParse($foreignUrl));
+        }
+    }
+
+    public function test_get_source_name(): void
+    {
+        $this->assertEquals('The Guardian', $this->parser->getSourceName());
+    }
+
+    public function test_extract_data_delegates_to_page_parser(): void
+    {
+        $markup = '
+            <html>
+                <head>
+                    <meta property="og:title" content="Test Title"/>
+                    <meta property="og:description" content="Test Description"/>
+                </head>
+                <body><p>Content</p></body>
+            </html>
+        ';
+
+        $extracted = $this->parser->extractData($markup);
+
+        $this->assertIsArray($extracted);
+        $this->assertArrayHasKey('title', $extracted);
+        $this->assertEquals('Test Title', $extracted['title']);
+    }
+}
\ No newline at end of file