From 1e39a25f83c9123fde4ade54f12af6ecb22d1952 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 8 Mar 2026 11:02:46 +0100 Subject: [PATCH] 37 - Add The Guardian as RSS feed provider, implement RSS parsing --- app/Http/Requests/StoreFeedRequest.php | 6 +- app/Services/Article/ArticleFetcher.php | 42 ++- .../Factories/ArticleParserFactory.php | 2 + .../Parsers/GuardianArticlePageParser.php | 110 +++++++ .../Parsers/GuardianArticleParser.php | 23 ++ config/feed.php | 16 +- .../Api/V1/FeedsControllerTest.php | 34 ++- tests/Unit/Actions/CreateFeedActionTest.php | 14 +- tests/Unit/Services/ArticleFetcherRssTest.php | 164 ++++++++++ .../Factories/ArticleParserFactoryTest.php | 5 +- .../Parsers/GuardianArticlePageParserTest.php | 285 ++++++++++++++++++ .../Parsers/GuardianArticleParserTest.php | 63 ++++ 12 files changed, 754 insertions(+), 10 deletions(-) create mode 100644 app/Services/Parsers/GuardianArticlePageParser.php create mode 100644 app/Services/Parsers/GuardianArticleParser.php create mode 100644 tests/Unit/Services/ArticleFetcherRssTest.php create mode 100644 tests/Unit/Services/Parsers/GuardianArticlePageParserTest.php create mode 100644 tests/Unit/Services/Parsers/GuardianArticleParserTest.php diff --git a/app/Http/Requests/StoreFeedRequest.php b/app/Http/Requests/StoreFeedRequest.php index a49570c..ac2e533 100644 --- a/app/Http/Requests/StoreFeedRequest.php +++ b/app/Http/Requests/StoreFeedRequest.php @@ -12,13 +12,15 @@ public function authorize(): bool } /** - * @return array + * @return array */ public function rules(): array { + $providers = implode(',', array_keys(config('feed.providers', []))); + return [ 'name' => 'required|string|max:255', - 'provider' => 'required|in:vrt,belga', + 'provider' => "required|in:{$providers}", 'language_id' => 'required|exists:languages,id', 'description' => 'nullable|string', 'is_active' => 'boolean' diff --git a/app/Services/Article/ArticleFetcher.php b/app/Services/Article/ArticleFetcher.php index 44124c4..6669a7c 100644 --- a/app/Services/Article/ArticleFetcher.php +++ b/app/Services/Article/ArticleFetcher.php @@ -41,9 +41,45 @@ public function getArticlesFromFeed(Feed $feed): Collection */ private function getArticlesFromRssFeed(Feed $feed): Collection { - // TODO: Implement RSS feed parsing - // For now, return empty collection - return collect(); + try { + $xml = HttpFetcher::fetchHtml($feed->url); + + $previousUseErrors = libxml_use_internal_errors(true); + + try { + $rss = simplexml_load_string($xml); + } finally { + libxml_clear_errors(); + libxml_use_internal_errors($previousUseErrors); + } + + if ($rss === false || !isset($rss->channel->item)) { + $this->logSaver->warning("Failed to parse RSS feed XML", null, [ + 'feed_id' => $feed->id, + 'feed_url' => $feed->url, + ]); + + return collect(); + } + + $articles = collect(); + foreach ($rss->channel->item as $item) { + $link = (string) $item->link; + if ($link !== '') { + $articles->push($this->saveArticle($link, $feed->id)); + } + } + + return $articles; + } catch (Exception $e) { + $this->logSaver->error("Failed to fetch articles from RSS feed", null, [ + 'feed_id' => $feed->id, + 'feed_url' => $feed->url, + 'error' => $e->getMessage(), + ]); + + return collect(); + } } /** diff --git a/app/Services/Factories/ArticleParserFactory.php b/app/Services/Factories/ArticleParserFactory.php index 765994a..cfef7b3 100644 --- a/app/Services/Factories/ArticleParserFactory.php +++ b/app/Services/Factories/ArticleParserFactory.php @@ -6,6 +6,7 @@ use App\Models\Feed; use App\Services\Parsers\VrtArticleParser; use App\Services\Parsers\BelgaArticleParser; +use App\Services\Parsers\GuardianArticleParser; use Exception; class ArticleParserFactory @@ -16,6 +17,7 @@ class ArticleParserFactory private static array $parsers = [ VrtArticleParser::class, BelgaArticleParser::class, + GuardianArticleParser::class, ]; /** diff --git a/app/Services/Parsers/GuardianArticlePageParser.php b/app/Services/Parsers/GuardianArticlePageParser.php new file mode 100644 index 0000000..7f94570 --- /dev/null +++ b/app/Services/Parsers/GuardianArticlePageParser.php @@ -0,0 +1,110 @@ +]*>([^<]+)<\/h1>/i', $html, $matches)) { + return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8'); + } + + // Try title tag + if (preg_match('/([^<]+)<\/title>/i', $html, $matches)) { + return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8'); + } + + return null; + } + + public static function extractDescription(string $html): ?string + { + // Try meta description first + if (preg_match('/<meta property="og:description" content="([^"]+)"/i', $html, $matches)) { + return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8'); + } + + // Try first paragraph + if (preg_match('/<p[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i', $html, $matches)) { + return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8'); + } + + return null; + } + + public static function extractFullArticle(string $html): ?string + { + // Remove scripts, styles, and other non-content elements + $cleanHtml = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi', '', $html); + $cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi', '', $cleanHtml); + + // Try Guardian-specific article body container (greedy to avoid stopping at nested divs) + if (preg_match('/<div[^>]*class="[^"]*article-body-commercial-selector[^"]*"[^>]*>(.*)<\/div>/is', $cleanHtml, $sectionMatches)) { + $sectionHtml = $sectionMatches[1]; + preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $sectionHtml, $matches); + + if (!empty($matches[1])) { + return self::joinParagraphs($matches[1]); + } + } + + // Fallback: extract all paragraph content + preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches); + if (!empty($matches[1])) { + return self::joinParagraphs($matches[1]); + } + + return null; + } + + public static function extractThumbnail(string $html): ?string + { + // Try OpenGraph image first + if (preg_match('/<meta property="og:image" content="([^"]+)"/i', $html, $matches)) { + return $matches[1]; + } + + // Try first image in content + if (preg_match('/<img[^>]+src="([^"]+)"/i', $html, $matches)) { + return $matches[1]; + } + + return null; + } + + /** + * @return array<string, string|null> + */ + public static function extractData(string $html): array + { + return [ + 'title' => self::extractTitle($html), + 'description' => self::extractDescription($html), + 'full_article' => self::extractFullArticle($html), + 'thumbnail' => self::extractThumbnail($html), + ]; + } + + /** + * @param array<int, string> $paragraphs + */ + private static function joinParagraphs(array $paragraphs): ?string + { + $paragraphs = array_map(function ($paragraph) { + return html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8'); + }, $paragraphs); + + $fullText = implode("\n\n", array_filter($paragraphs, function ($p) { + return trim($p) !== ''; + })); + + return $fullText ?: null; + } +} \ No newline at end of file diff --git a/app/Services/Parsers/GuardianArticleParser.php b/app/Services/Parsers/GuardianArticleParser.php new file mode 100644 index 0000000..a363199 --- /dev/null +++ b/app/Services/Parsers/GuardianArticleParser.php @@ -0,0 +1,23 @@ +<?php + +namespace App\Services\Parsers; + +use App\Contracts\ArticleParserInterface; + +class GuardianArticleParser implements ArticleParserInterface +{ + public function canParse(string $url): bool + { + return str_contains($url, 'theguardian.com'); + } + + public function extractData(string $html): array + { + return GuardianArticlePageParser::extractData($html); + } + + public function getSourceName(): string + { + return 'The Guardian'; + } +} \ No newline at end of file diff --git a/config/feed.php b/config/feed.php index b6bdfd7..4ddb9ea 100644 --- a/config/feed.php +++ b/config/feed.php @@ -33,7 +33,7 @@ 'code' => 'belga', 'name' => 'Belga News Agency', 'description' => 'Belgian national news agency', - 'type' => 'rss', + 'type' => 'website', 'is_active' => true, 'languages' => [ 'en' => ['url' => 'https://www.belganewsagency.eu/'], @@ -44,6 +44,20 @@ 'article_page' => \App\Services\Parsers\BelgaArticlePageParser::class, ], ], + 'guardian' => [ + 'code' => 'guardian', + 'name' => 'The Guardian', + 'description' => 'British daily newspaper', + 'type' => 'rss', + 'is_active' => true, + 'languages' => [ + 'en' => ['url' => 'https://www.theguardian.com/international/rss'], + ], + 'parsers' => [ + 'article' => \App\Services\Parsers\GuardianArticleParser::class, + 'article_page' => \App\Services\Parsers\GuardianArticlePageParser::class, + ], + ], ], /* diff --git a/tests/Feature/Http/Controllers/Api/V1/FeedsControllerTest.php b/tests/Feature/Http/Controllers/Api/V1/FeedsControllerTest.php index 25c2695..143a8f5 100644 --- a/tests/Feature/Http/Controllers/Api/V1/FeedsControllerTest.php +++ b/tests/Feature/Http/Controllers/Api/V1/FeedsControllerTest.php @@ -99,7 +99,7 @@ public function test_store_creates_belga_feed_successfully(): void 'data' => [ 'name' => 'Belga Test Feed', 'url' => 'https://www.belganewsagency.eu/', - 'type' => 'rss', + 'type' => 'website', 'is_active' => true, ] ]); @@ -107,6 +107,38 @@ public function test_store_creates_belga_feed_successfully(): void $this->assertDatabaseHas('feeds', [ 'name' => 'Belga Test Feed', 'url' => 'https://www.belganewsagency.eu/', + 'type' => 'website', + ]); + } + + public function test_store_creates_guardian_feed_successfully(): void + { + $language = Language::factory()->english()->create(); + + $feedData = [ + 'name' => 'Guardian Test Feed', + 'provider' => 'guardian', + 'language_id' => $language->id, + 'is_active' => true, + ]; + + $response = $this->postJson('/api/v1/feeds', $feedData); + + $response->assertStatus(201) + ->assertJson([ + 'success' => true, + 'message' => 'Feed created successfully!', + 'data' => [ + 'name' => 'Guardian Test Feed', + 'url' => 'https://www.theguardian.com/international/rss', + 'type' => 'rss', + 'is_active' => true, + ] + ]); + + $this->assertDatabaseHas('feeds', [ + 'name' => 'Guardian Test Feed', + 'url' => 'https://www.theguardian.com/international/rss', 'type' => 'rss', ]); } diff --git a/tests/Unit/Actions/CreateFeedActionTest.php b/tests/Unit/Actions/CreateFeedActionTest.php index 072bdab..5c4eec6 100644 --- a/tests/Unit/Actions/CreateFeedActionTest.php +++ b/tests/Unit/Actions/CreateFeedActionTest.php @@ -43,11 +43,23 @@ public function test_creates_belga_feed_with_correct_url(): void $feed = $this->action->execute('Belga News', 'belga', $language->id); $this->assertEquals('https://www.belganewsagency.eu/', $feed->url); - $this->assertEquals('rss', $feed->type); + $this->assertEquals('website', $feed->type); $this->assertEquals('belga', $feed->provider); $this->assertNull($feed->description); } + public function test_creates_guardian_feed_with_correct_url(): void + { + $language = Language::factory()->create(['short_code' => 'en', 'is_active' => true]); + + $feed = $this->action->execute('Guardian News', 'guardian', $language->id); + + $this->assertEquals('https://www.theguardian.com/international/rss', $feed->url); + $this->assertEquals('rss', $feed->type); + $this->assertEquals('guardian', $feed->provider); + $this->assertNull($feed->description); + } + public function test_creates_vrt_feed_with_dutch_language(): void { $language = Language::factory()->create(['short_code' => 'nl', 'is_active' => true]); diff --git a/tests/Unit/Services/ArticleFetcherRssTest.php b/tests/Unit/Services/ArticleFetcherRssTest.php new file mode 100644 index 0000000..0479d6a --- /dev/null +++ b/tests/Unit/Services/ArticleFetcherRssTest.php @@ -0,0 +1,164 @@ +<?php + +namespace Tests\Unit\Services; + +use App\Models\Article; +use App\Models\Feed; +use Illuminate\Foundation\Testing\RefreshDatabase; +use Illuminate\Support\Facades\Http; +use Mockery; +use Tests\TestCase; +use Tests\Traits\CreatesArticleFetcher; + +class ArticleFetcherRssTest extends TestCase +{ + use RefreshDatabase, CreatesArticleFetcher; + + private string $sampleRss; + + protected function setUp(): void + { + parent::setUp(); + + $this->sampleRss = <<<'XML' +<?xml version="1.0" encoding="UTF-8"?> +<rss version="2.0"> + <channel> + <title>The Guardian - International + https://www.theguardian.com/international + + First Article Title + https://www.theguardian.com/world/2026/mar/08/first-article + First article description + Sun, 08 Mar 2026 12:00:00 GMT + + + Second Article Title + https://www.theguardian.com/world/2026/mar/08/second-article + Second article description + Sun, 08 Mar 2026 11:00:00 GMT + + + +XML; + } + + public function test_get_articles_from_rss_feed_returns_collection(): void + { + Http::fake(['*' => Http::response($this->sampleRss, 200)]); + + $feed = Feed::factory()->create([ + 'type' => 'rss', + 'provider' => 'guardian', + 'url' => 'https://www.theguardian.com/international/rss', + ]); + + $fetcher = $this->createArticleFetcher(); + $result = $fetcher->getArticlesFromFeed($feed); + + $this->assertInstanceOf(\Illuminate\Support\Collection::class, $result); + } + + public function test_get_articles_from_rss_feed_creates_articles(): void + { + Http::fake(['*' => Http::response($this->sampleRss, 200)]); + + $feed = Feed::factory()->create([ + 'type' => 'rss', + 'provider' => 'guardian', + 'url' => 'https://www.theguardian.com/international/rss', + ]); + + $fetcher = $this->createArticleFetcher(); + $result = $fetcher->getArticlesFromFeed($feed); + + $this->assertCount(2, $result); + $this->assertDatabaseHas('articles', [ + 'url' => 'https://www.theguardian.com/world/2026/mar/08/first-article', + 'feed_id' => $feed->id, + ]); + $this->assertDatabaseHas('articles', [ + 'url' => 'https://www.theguardian.com/world/2026/mar/08/second-article', + 'feed_id' => $feed->id, + ]); + } + + public function test_get_articles_from_rss_feed_does_not_duplicate_existing(): void + { + Http::fake(['*' => Http::response($this->sampleRss, 200)]); + + $feed = Feed::factory()->create([ + 'type' => 'rss', + 'provider' => 'guardian', + 'url' => 'https://www.theguardian.com/international/rss', + ]); + + Article::factory()->create([ + 'url' => 'https://www.theguardian.com/world/2026/mar/08/first-article', + 'feed_id' => $feed->id, + ]); + + $fetcher = $this->createArticleFetcher(); + $result = $fetcher->getArticlesFromFeed($feed); + + $this->assertCount(2, $result); + $this->assertEquals(1, Article::where('url', 'https://www.theguardian.com/world/2026/mar/08/first-article')->count()); + } + + public function test_get_articles_from_rss_feed_handles_invalid_xml(): void + { + Http::fake(['*' => Http::response('this is not xml', 200)]); + + $feed = Feed::factory()->create([ + 'type' => 'rss', + 'provider' => 'guardian', + 'url' => 'https://www.theguardian.com/international/rss', + ]); + + $fetcher = $this->createArticleFetcher(); + $result = $fetcher->getArticlesFromFeed($feed); + + $this->assertInstanceOf(\Illuminate\Support\Collection::class, $result); + $this->assertEmpty($result); + } + + public function test_get_articles_from_rss_feed_handles_empty_channel(): void + { + Http::fake([ + '*' => Http::response('Empty', 200), + ]); + + $feed = Feed::factory()->create([ + 'type' => 'rss', + 'provider' => 'guardian', + 'url' => 'https://www.theguardian.com/international/rss', + ]); + + $fetcher = $this->createArticleFetcher(); + $result = $fetcher->getArticlesFromFeed($feed); + + $this->assertEmpty($result); + } + + public function test_get_articles_from_rss_feed_handles_http_failure(): void + { + Http::fake(['*' => Http::response('Server Error', 500)]); + + $feed = Feed::factory()->create([ + 'type' => 'rss', + 'provider' => 'guardian', + 'url' => 'https://www.theguardian.com/international/rss', + ]); + + $fetcher = $this->createArticleFetcher(); + $result = $fetcher->getArticlesFromFeed($feed); + + $this->assertEmpty($result); + } + + protected function tearDown(): void + { + Mockery::close(); + parent::tearDown(); + } +} \ No newline at end of file diff --git a/tests/Unit/Services/Factories/ArticleParserFactoryTest.php b/tests/Unit/Services/Factories/ArticleParserFactoryTest.php index a0a9808..ef5a24a 100644 --- a/tests/Unit/Services/Factories/ArticleParserFactoryTest.php +++ b/tests/Unit/Services/Factories/ArticleParserFactoryTest.php @@ -46,9 +46,10 @@ public function test_get_supported_sources_returns_array_of_source_names(): void $sources = ArticleParserFactory::getSupportedSources(); $this->assertIsArray($sources); - $this->assertCount(2, $sources); + $this->assertCount(3, $sources); $this->assertContains('VRT News', $sources); $this->assertContains('Belga News Agency', $sources); + $this->assertContains('The Guardian', $sources); } public function test_get_supported_sources_returns_sources_in_correct_order(): void @@ -88,7 +89,7 @@ public function getSourceName(): string // Verify it's now included in supported sources $sources = ArticleParserFactory::getSupportedSources(); $this->assertContains('TestParser', $sources); - $this->assertCount(3, $sources); // Original 2 + 1 new + $this->assertCount(4, $sources); // Original 3 + 1 new // Verify it can be used to parse URLs $testUrl = 'https://test-parser.com/article'; diff --git a/tests/Unit/Services/Parsers/GuardianArticlePageParserTest.php b/tests/Unit/Services/Parsers/GuardianArticlePageParserTest.php new file mode 100644 index 0000000..a0126d0 --- /dev/null +++ b/tests/Unit/Services/Parsers/GuardianArticlePageParserTest.php @@ -0,0 +1,285 @@ +'; + + $title = GuardianArticlePageParser::extractTitle($html); + + $this->assertEquals('Guardian Article Title', $title); + } + + public function test_extract_title_from_h1_tag(): void + { + $html = '

H1 Title Test

'; + + $title = GuardianArticlePageParser::extractTitle($html); + + $this->assertEquals('H1 Title Test', $title); + } + + public function test_extract_title_from_title_tag(): void + { + $html = 'Page Title Test'; + + $title = GuardianArticlePageParser::extractTitle($html); + + $this->assertEquals('Page Title Test', $title); + } + + public function test_extract_title_with_html_entities(): void + { + $html = ''; + + $title = GuardianArticlePageParser::extractTitle($html); + + $this->assertEquals('Test & Article "Title"', $title); + } + + public function test_extract_title_returns_null_when_not_found(): void + { + $html = '

No title here

'; + + $title = GuardianArticlePageParser::extractTitle($html); + + $this->assertNull($title); + } + + public function test_extract_description_from_og_meta_tag(): void + { + $html = ''; + + $description = GuardianArticlePageParser::extractDescription($html); + + $this->assertEquals('Guardian article description', $description); + } + + public function test_extract_description_from_paragraph(): void + { + $html = '

This is the first paragraph description.

'; + + $description = GuardianArticlePageParser::extractDescription($html); + + $this->assertEquals('This is the first paragraph description.', $description); + } + + public function test_extract_description_returns_null_when_not_found(): void + { + $html = '
No description here
'; + + $description = GuardianArticlePageParser::extractDescription($html); + + $this->assertNull($description); + } + + public function test_extract_full_article_from_guardian_article_body(): void + { + $html = ' + + +
+

First paragraph of the article.

+

Second paragraph of the article.

+
+ + + '; + + $fullArticle = GuardianArticlePageParser::extractFullArticle($html); + + $expected = "First paragraph of the article.\n\nSecond paragraph of the article."; + $this->assertEquals($expected, $fullArticle); + } + + public function test_extract_full_article_fallback_to_all_paragraphs(): void + { + $html = ' + + +

First general paragraph.

+

Second general paragraph.

+ + + '; + + $fullArticle = GuardianArticlePageParser::extractFullArticle($html); + + $expected = "First general paragraph.\n\nSecond general paragraph."; + $this->assertEquals($expected, $fullArticle); + } + + public function test_extract_full_article_filters_empty_paragraphs(): void + { + $html = ' + + +
+

Content paragraph.

+

+

+

Another content paragraph.

+
+ + + '; + + $fullArticle = GuardianArticlePageParser::extractFullArticle($html); + + $expected = "Content paragraph.\n\nAnother content paragraph."; + $this->assertEquals($expected, $fullArticle); + } + + public function test_extract_full_article_handles_nested_tags(): void + { + $html = ' + + +
+

This has bold text and italic text.

+

This has a link inside.

+
+ + + '; + + $fullArticle = GuardianArticlePageParser::extractFullArticle($html); + + $expected = "This has bold text and italic text.\n\nThis has a link inside."; + $this->assertEquals($expected, $fullArticle); + } + + public function test_extract_full_article_removes_scripts_and_styles(): void + { + $html = ' + + + + + + +
+

Clean content.

+
+ + + + '; + + $fullArticle = GuardianArticlePageParser::extractFullArticle($html); + + $this->assertEquals('Clean content.', $fullArticle); + $this->assertStringNotContainsString('console.log', $fullArticle); + $this->assertStringNotContainsString('alert', $fullArticle); + } + + public function test_extract_full_article_returns_null_when_no_content(): void + { + $html = '
No paragraphs here
'; + + $fullArticle = GuardianArticlePageParser::extractFullArticle($html); + + $this->assertNull($fullArticle); + } + + public function test_extract_thumbnail_from_og_image(): void + { + $html = ''; + + $thumbnail = GuardianArticlePageParser::extractThumbnail($html); + + $this->assertEquals('https://i.guim.co.uk/img/test.jpg', $thumbnail); + } + + public function test_extract_thumbnail_from_img_tag(): void + { + $html = 'test'; + + $thumbnail = GuardianArticlePageParser::extractThumbnail($html); + + $this->assertEquals('https://i.guim.co.uk/img/article-image.png', $thumbnail); + } + + public function test_extract_thumbnail_returns_null_when_not_found(): void + { + $html = '
No images here
'; + + $thumbnail = GuardianArticlePageParser::extractThumbnail($html); + + $this->assertNull($thumbnail); + } + + public function test_extract_data_returns_all_components(): void + { + $html = ' + + + + + + + +
+

Full article content here.

+
+ + + '; + + $data = GuardianArticlePageParser::extractData($html); + + $this->assertIsArray($data); + $this->assertArrayHasKey('title', $data); + $this->assertArrayHasKey('description', $data); + $this->assertArrayHasKey('full_article', $data); + $this->assertArrayHasKey('thumbnail', $data); + + $this->assertEquals('Guardian Test Article', $data['title']); + $this->assertEquals('Test description', $data['description']); + $this->assertEquals('Full article content here.', $data['full_article']); + $this->assertEquals('https://i.guim.co.uk/img/image.jpg', $data['thumbnail']); + } + + public function test_extract_data_handles_missing_components_gracefully(): void + { + $html = '
Minimal content
'; + + $data = GuardianArticlePageParser::extractData($html); + + $this->assertIsArray($data); + $this->assertNull($data['title']); + $this->assertNull($data['description']); + $this->assertNull($data['full_article']); + $this->assertNull($data['thumbnail']); + } + + public function test_extract_full_article_with_realistic_guardian_html(): void + { + $html = ' + + +
+

The prime minister has announced a new climate policy that aims to reduce carbon emissions by 50% by 2030.

+

The announcement came during a press conference at Downing Street on Tuesday afternoon.

+

Environmental groups have cautiously welcomed the move, while industry leaders have expressed concern about the timeline.

+
+ + + '; + + $fullArticle = GuardianArticlePageParser::extractFullArticle($html); + + $this->assertNotNull($fullArticle); + $this->assertStringContainsString('climate policy', $fullArticle); + $this->assertStringContainsString('press conference', $fullArticle); + $this->assertStringContainsString('Environmental groups', $fullArticle); + $this->assertStringContainsString("\n\n", $fullArticle); + $this->assertStringNotContainsString('', $fullArticle); + } +} \ No newline at end of file diff --git a/tests/Unit/Services/Parsers/GuardianArticleParserTest.php b/tests/Unit/Services/Parsers/GuardianArticleParserTest.php new file mode 100644 index 0000000..51bcd43 --- /dev/null +++ b/tests/Unit/Services/Parsers/GuardianArticleParserTest.php @@ -0,0 +1,63 @@ +parser = new GuardianArticleParser(); + } + + public function test_implements_article_parser_interface(): void + { + $this->assertInstanceOf(ArticleParserInterface::class, $this->parser); + } + + public function test_can_parse_guardian_url(): void + { + $this->assertTrue($this->parser->canParse('https://www.theguardian.com/world/2026/mar/08/some-article')); + } + + public function test_can_parse_guardian_url_without_www(): void + { + $this->assertTrue($this->parser->canParse('https://theguardian.com/world/2026/mar/08/some-article')); + } + + public function test_cannot_parse_non_guardian_url(): void + { + $this->assertFalse($this->parser->canParse('https://www.vrt.be/vrtnws/en/article')); + $this->assertFalse($this->parser->canParse('https://www.belganewsagency.eu/article')); + } + + public function test_get_source_name(): void + { + $this->assertEquals('The Guardian', $this->parser->getSourceName()); + } + + public function test_extract_data_delegates_to_page_parser(): void + { + $html = ' + + + + + +

Content

+ + '; + + $data = $this->parser->extractData($html); + + $this->assertIsArray($data); + $this->assertArrayHasKey('title', $data); + $this->assertEquals('Test Title', $data['title']); + } +} \ No newline at end of file