37 - Add The Guardian as RSS feed provider, implement RSS parsing

This commit is contained in:
myrmidex 2026-03-08 11:02:46 +01:00
parent 0123e20b1d
commit 1e39a25f83
12 changed files with 754 additions and 10 deletions

View file

@ -12,13 +12,15 @@ public function authorize(): bool
} }
/** /**
* @return array<string, string> * @return array<string, mixed>
*/ */
public function rules(): array public function rules(): array
{ {
$providers = implode(',', array_keys(config('feed.providers', [])));
return [ return [
'name' => 'required|string|max:255', 'name' => 'required|string|max:255',
'provider' => 'required|in:vrt,belga', 'provider' => "required|in:{$providers}",
'language_id' => 'required|exists:languages,id', 'language_id' => 'required|exists:languages,id',
'description' => 'nullable|string', 'description' => 'nullable|string',
'is_active' => 'boolean' 'is_active' => 'boolean'

View file

@ -41,11 +41,47 @@ public function getArticlesFromFeed(Feed $feed): Collection
*/ */
private function getArticlesFromRssFeed(Feed $feed): Collection private function getArticlesFromRssFeed(Feed $feed): Collection
{ {
// TODO: Implement RSS feed parsing try {
// For now, return empty collection $xml = HttpFetcher::fetchHtml($feed->url);
$previousUseErrors = libxml_use_internal_errors(true);
try {
$rss = simplexml_load_string($xml);
} finally {
libxml_clear_errors();
libxml_use_internal_errors($previousUseErrors);
}
if ($rss === false || !isset($rss->channel->item)) {
$this->logSaver->warning("Failed to parse RSS feed XML", null, [
'feed_id' => $feed->id,
'feed_url' => $feed->url,
]);
return collect(); return collect();
} }
$articles = collect();
foreach ($rss->channel->item as $item) {
$link = (string) $item->link;
if ($link !== '') {
$articles->push($this->saveArticle($link, $feed->id));
}
}
return $articles;
} catch (Exception $e) {
$this->logSaver->error("Failed to fetch articles from RSS feed", null, [
'feed_id' => $feed->id,
'feed_url' => $feed->url,
'error' => $e->getMessage(),
]);
return collect();
}
}
/** /**
* @return Collection<int, Article> * @return Collection<int, Article>
*/ */

View file

@ -6,6 +6,7 @@
use App\Models\Feed; use App\Models\Feed;
use App\Services\Parsers\VrtArticleParser; use App\Services\Parsers\VrtArticleParser;
use App\Services\Parsers\BelgaArticleParser; use App\Services\Parsers\BelgaArticleParser;
use App\Services\Parsers\GuardianArticleParser;
use Exception; use Exception;
class ArticleParserFactory class ArticleParserFactory
@ -16,6 +17,7 @@ class ArticleParserFactory
private static array $parsers = [ private static array $parsers = [
VrtArticleParser::class, VrtArticleParser::class,
BelgaArticleParser::class, BelgaArticleParser::class,
GuardianArticleParser::class,
]; ];
/** /**

View file

@ -0,0 +1,110 @@
<?php
namespace App\Services\Parsers;
/**
 * Regex-based extractor for Guardian article pages.
 *
 * Every method is static and operates on a raw HTML string. Each extractor
 * returns null when none of its patterns match.
 */
class GuardianArticlePageParser
{
    /**
     * Extract the article title.
     *
     * Sources are tried in order: the og:title meta tag, the first <h1> that
     * contains only text, then the <title> tag.
     *
     * @param string $html Raw page HTML.
     * @return string|null Entity-decoded title, or null when no source matched.
     */
    public static function extractTitle(string $html): ?string
    {
        // Try meta title first
        if (preg_match('/<meta property="og:title" content="([^"]+)"/i', $html, $matches)) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        // Try any h1 tag
        if (preg_match('/<h1[^>]*>([^<]+)<\/h1>/i', $html, $matches)) {
            return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
        }

        // Try title tag
        if (preg_match('/<title>([^<]+)<\/title>/i', $html, $matches)) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Extract a short description.
     *
     * Uses the og:description meta tag when present, otherwise the first
     * <p> element that starts with text.
     *
     * @param string $html Raw page HTML.
     * @return string|null Entity-decoded description, or null when nothing matched.
     */
    public static function extractDescription(string $html): ?string
    {
        // Try meta description first
        if (preg_match('/<meta property="og:description" content="([^"]+)"/i', $html, $matches)) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        // Try first paragraph. The \b keeps <pre>, <picture>, <path>, ... from
        // being mistaken for a <p> opening tag.
        if (preg_match('/<p\b[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i', $html, $matches)) {
            return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Extract the article body as plain text, one paragraph per block
     * separated by a blank line.
     *
     * <script> and <style> elements are removed first. Paragraphs are taken
     * from the Guardian article-body container when it is present and
     * contains paragraphs; otherwise from every <p> in the page.
     *
     * @param string $html Raw page HTML.
     * @return string|null Joined paragraph text, or null when no non-empty paragraph exists.
     */
    public static function extractFullArticle(string $html): ?string
    {
        // Remove scripts, styles, and other non-content elements.
        // preg_replace() returns null on a PCRE failure (e.g. backtrack limit
        // on a very large page); fall back to the unstripped input instead of
        // feeding null into the next pattern.
        $withoutScripts = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi', '', $html) ?? $html;
        $cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi', '', $withoutScripts)
            ?? $withoutScripts;

        // Try Guardian-specific article body container (greedy to avoid stopping at nested divs)
        if (preg_match('/<div[^>]*class="[^"]*article-body-commercial-selector[^"]*"[^>]*>(.*)<\/div>/is', $cleanHtml, $sectionMatches)) {
            preg_match_all('/<p\b[^>]*>(.*?)<\/p>/is', $sectionMatches[1], $matches);
            if (!empty($matches[1])) {
                return self::joinParagraphs($matches[1]);
            }
        }

        // Fallback: extract all paragraph content
        preg_match_all('/<p\b[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches);
        if (!empty($matches[1])) {
            return self::joinParagraphs($matches[1]);
        }

        return null;
    }

    /**
     * Extract a thumbnail URL.
     *
     * Uses the og:image meta tag when present, otherwise the first
     * double-quoted src attribute found inside an <img> tag.
     *
     * @param string $html Raw page HTML.
     * @return string|null The URL exactly as written in the markup, or null when none was found.
     */
    public static function extractThumbnail(string $html): ?string
    {
        // Try OpenGraph image first
        if (preg_match('/<meta property="og:image" content="([^"]+)"/i', $html, $matches)) {
            return $matches[1];
        }

        // Try first image in content
        if (preg_match('/<img[^>]+src="([^"]+)"/i', $html, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Run every extractor over the same HTML.
     *
     * @param string $html Raw page HTML.
     * @return array<string, string|null> Keys: title, description, full_article, thumbnail.
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
            'thumbnail' => self::extractThumbnail($html),
        ];
    }

    /**
     * Strip tags and decode entities in each paragraph, drop the ones that
     * are empty after trimming, and join the rest with a blank line.
     *
     * @param array<int, string> $paragraphs Inner HTML of each paragraph.
     * @return string|null Joined text, or null when every paragraph was empty.
     */
    private static function joinParagraphs(array $paragraphs): ?string
    {
        $plainText = array_map(
            static fn (string $paragraph): string => html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8'),
            $paragraphs,
        );

        $fullText = implode("\n\n", array_filter(
            $plainText,
            static fn (string $paragraph): bool => trim($paragraph) !== '',
        ));

        // Strict comparison: a truthiness check would also discard the text "0".
        return $fullText !== '' ? $fullText : null;
    }
}

View file

@ -0,0 +1,23 @@
<?php
namespace App\Services\Parsers;
use App\Contracts\ArticleParserInterface;
/**
 * Article parser for pages hosted on theguardian.com.
 *
 * Data extraction is delegated to GuardianArticlePageParser.
 */
class GuardianArticleParser implements ArticleParserInterface
{
    /**
     * Tell whether the URL points at theguardian.com or one of its subdomains.
     *
     * Only the host component is compared, so a Guardian address that merely
     * appears in the path or query string of another site (or a look-alike
     * host such as "nottheguardian.com") is rejected.
     *
     * @param string $url Absolute URL, or a scheme-less "host/path" string.
     * @return bool True when the host is theguardian.com or ends with ".theguardian.com".
     */
    public function canParse(string $url): bool
    {
        $host = parse_url($url, PHP_URL_HOST);

        if (!is_string($host) || $host === '') {
            // Scheme-less input ("www.theguardian.com/world/...") exposes no host
            // component until it is read as a network-path reference.
            $host = parse_url('//' . ltrim($url, '/'), PHP_URL_HOST);
        }

        if (!is_string($host)) {
            return false;
        }

        $host = strtolower($host);

        return $host === 'theguardian.com' || str_ends_with($host, '.theguardian.com');
    }

    /**
     * @param string $html Raw page HTML.
     * @return array<string, string|null> Whatever GuardianArticlePageParser::extractData() returns.
     */
    public function extractData(string $html): array
    {
        return GuardianArticlePageParser::extractData($html);
    }

    /**
     * @return string Display name of this source.
     */
    public function getSourceName(): string
    {
        return 'The Guardian';
    }
}

View file

@ -33,7 +33,7 @@
'code' => 'belga', 'code' => 'belga',
'name' => 'Belga News Agency', 'name' => 'Belga News Agency',
'description' => 'Belgian national news agency', 'description' => 'Belgian national news agency',
'type' => 'rss', 'type' => 'website',
'is_active' => true, 'is_active' => true,
'languages' => [ 'languages' => [
'en' => ['url' => 'https://www.belganewsagency.eu/'], 'en' => ['url' => 'https://www.belganewsagency.eu/'],
@ -44,6 +44,20 @@
'article_page' => \App\Services\Parsers\BelgaArticlePageParser::class, 'article_page' => \App\Services\Parsers\BelgaArticlePageParser::class,
], ],
], ],
'guardian' => [
'code' => 'guardian',
'name' => 'The Guardian',
'description' => 'British daily newspaper',
'type' => 'rss',
'is_active' => true,
'languages' => [
'en' => ['url' => 'https://www.theguardian.com/international/rss'],
],
'parsers' => [
'article' => \App\Services\Parsers\GuardianArticleParser::class,
'article_page' => \App\Services\Parsers\GuardianArticlePageParser::class,
],
],
], ],
/* /*

View file

@ -99,7 +99,7 @@ public function test_store_creates_belga_feed_successfully(): void
'data' => [ 'data' => [
'name' => 'Belga Test Feed', 'name' => 'Belga Test Feed',
'url' => 'https://www.belganewsagency.eu/', 'url' => 'https://www.belganewsagency.eu/',
'type' => 'rss', 'type' => 'website',
'is_active' => true, 'is_active' => true,
] ]
]); ]);
@ -107,6 +107,38 @@ public function test_store_creates_belga_feed_successfully(): void
$this->assertDatabaseHas('feeds', [ $this->assertDatabaseHas('feeds', [
'name' => 'Belga Test Feed', 'name' => 'Belga Test Feed',
'url' => 'https://www.belganewsagency.eu/', 'url' => 'https://www.belganewsagency.eu/',
'type' => 'website',
]);
}
public function test_store_creates_guardian_feed_successfully(): void
{
$language = Language::factory()->english()->create();
$feedData = [
'name' => 'Guardian Test Feed',
'provider' => 'guardian',
'language_id' => $language->id,
'is_active' => true,
];
$response = $this->postJson('/api/v1/feeds', $feedData);
$response->assertStatus(201)
->assertJson([
'success' => true,
'message' => 'Feed created successfully!',
'data' => [
'name' => 'Guardian Test Feed',
'url' => 'https://www.theguardian.com/international/rss',
'type' => 'rss',
'is_active' => true,
]
]);
$this->assertDatabaseHas('feeds', [
'name' => 'Guardian Test Feed',
'url' => 'https://www.theguardian.com/international/rss',
'type' => 'rss', 'type' => 'rss',
]); ]);
} }

View file

@ -43,11 +43,23 @@ public function test_creates_belga_feed_with_correct_url(): void
$feed = $this->action->execute('Belga News', 'belga', $language->id); $feed = $this->action->execute('Belga News', 'belga', $language->id);
$this->assertEquals('https://www.belganewsagency.eu/', $feed->url); $this->assertEquals('https://www.belganewsagency.eu/', $feed->url);
$this->assertEquals('rss', $feed->type); $this->assertEquals('website', $feed->type);
$this->assertEquals('belga', $feed->provider); $this->assertEquals('belga', $feed->provider);
$this->assertNull($feed->description); $this->assertNull($feed->description);
} }
public function test_creates_guardian_feed_with_correct_url(): void
{
$language = Language::factory()->create(['short_code' => 'en', 'is_active' => true]);
$feed = $this->action->execute('Guardian News', 'guardian', $language->id);
$this->assertEquals('https://www.theguardian.com/international/rss', $feed->url);
$this->assertEquals('rss', $feed->type);
$this->assertEquals('guardian', $feed->provider);
$this->assertNull($feed->description);
}
public function test_creates_vrt_feed_with_dutch_language(): void public function test_creates_vrt_feed_with_dutch_language(): void
{ {
$language = Language::factory()->create(['short_code' => 'nl', 'is_active' => true]); $language = Language::factory()->create(['short_code' => 'nl', 'is_active' => true]);

View file

@ -0,0 +1,164 @@
<?php
namespace Tests\Unit\Services;
use App\Models\Article;
use App\Models\Feed;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Http;
use Mockery;
use Tests\TestCase;
use Tests\Traits\CreatesArticleFetcher;
/**
 * Covers the RSS branch of the article fetcher using faked HTTP responses:
 * a well-formed feed, an already-stored article, unparsable XML, a channel
 * without items, and an HTTP error.
 */
class ArticleFetcherRssTest extends TestCase
{
    use RefreshDatabase;
    use CreatesArticleFetcher;

    private const FEED_URL = 'https://www.theguardian.com/international/rss';
    private const FIRST_ARTICLE_URL = 'https://www.theguardian.com/world/2026/mar/08/first-article';
    private const SECOND_ARTICLE_URL = 'https://www.theguardian.com/world/2026/mar/08/second-article';

    /** Two-item RSS document served by the happy-path tests. */
    private string $sampleRss;

    protected function setUp(): void
    {
        parent::setUp();

        $this->sampleRss = <<<'XML'
        <?xml version="1.0" encoding="UTF-8"?>
        <rss version="2.0">
        <channel>
        <title>The Guardian - International</title>
        <link>https://www.theguardian.com/international</link>
        <item>
        <title>First Article Title</title>
        <link>https://www.theguardian.com/world/2026/mar/08/first-article</link>
        <description>First article description</description>
        <pubDate>Sun, 08 Mar 2026 12:00:00 GMT</pubDate>
        </item>
        <item>
        <title>Second Article Title</title>
        <link>https://www.theguardian.com/world/2026/mar/08/second-article</link>
        <description>Second article description</description>
        <pubDate>Sun, 08 Mar 2026 11:00:00 GMT</pubDate>
        </item>
        </channel>
        </rss>
        XML;
    }

    public function test_get_articles_from_rss_feed_returns_collection(): void
    {
        $this->fakeEveryResponse($this->sampleRss);

        $articles = $this->fetchArticlesFor($this->createGuardianFeed());

        self::assertInstanceOf(\Illuminate\Support\Collection::class, $articles);
    }

    public function test_get_articles_from_rss_feed_creates_articles(): void
    {
        $this->fakeEveryResponse($this->sampleRss);
        $guardianFeed = $this->createGuardianFeed();

        $articles = $this->fetchArticlesFor($guardianFeed);

        self::assertCount(2, $articles);
        foreach ([self::FIRST_ARTICLE_URL, self::SECOND_ARTICLE_URL] as $articleUrl) {
            $this->assertDatabaseHas('articles', [
                'url' => $articleUrl,
                'feed_id' => $guardianFeed->id,
            ]);
        }
    }

    public function test_get_articles_from_rss_feed_does_not_duplicate_existing(): void
    {
        $this->fakeEveryResponse($this->sampleRss);
        $guardianFeed = $this->createGuardianFeed();

        // The first feed item is already stored before the fetch runs.
        Article::factory()->create([
            'url' => self::FIRST_ARTICLE_URL,
            'feed_id' => $guardianFeed->id,
        ]);

        $articles = $this->fetchArticlesFor($guardianFeed);

        self::assertCount(2, $articles);
        self::assertEquals(1, Article::where('url', self::FIRST_ARTICLE_URL)->count());
    }

    public function test_get_articles_from_rss_feed_handles_invalid_xml(): void
    {
        $this->fakeEveryResponse('this is not xml');

        $articles = $this->fetchArticlesFor($this->createGuardianFeed());

        self::assertInstanceOf(\Illuminate\Support\Collection::class, $articles);
        self::assertEmpty($articles);
    }

    public function test_get_articles_from_rss_feed_handles_empty_channel(): void
    {
        $this->fakeEveryResponse('<?xml version="1.0"?><rss><channel><title>Empty</title></channel></rss>');

        $articles = $this->fetchArticlesFor($this->createGuardianFeed());

        self::assertEmpty($articles);
    }

    public function test_get_articles_from_rss_feed_handles_http_failure(): void
    {
        $this->fakeEveryResponse('Server Error', 500);

        $articles = $this->fetchArticlesFor($this->createGuardianFeed());

        self::assertEmpty($articles);
    }

    protected function tearDown(): void
    {
        Mockery::close();

        parent::tearDown();
    }

    /**
     * Make every outgoing HTTP request answer with the given body and status.
     */
    private function fakeEveryResponse(string $body, int $status = 200): void
    {
        Http::fake(['*' => Http::response($body, $status)]);
    }

    /**
     * Persist an RSS-type Guardian feed pointing at FEED_URL.
     */
    private function createGuardianFeed(): Feed
    {
        return Feed::factory()->create([
            'type' => 'rss',
            'provider' => 'guardian',
            'url' => self::FEED_URL,
        ]);
    }

    /**
     * Build a fetcher through the CreatesArticleFetcher trait and run it on the feed.
     *
     * @return mixed Whatever getArticlesFromFeed() returns; the tests assert on it.
     */
    private function fetchArticlesFor(Feed $feed)
    {
        $fetcher = $this->createArticleFetcher();

        return $fetcher->getArticlesFromFeed($feed);
    }
}

View file

@ -46,9 +46,10 @@ public function test_get_supported_sources_returns_array_of_source_names(): void
$sources = ArticleParserFactory::getSupportedSources(); $sources = ArticleParserFactory::getSupportedSources();
$this->assertIsArray($sources); $this->assertIsArray($sources);
$this->assertCount(2, $sources); $this->assertCount(3, $sources);
$this->assertContains('VRT News', $sources); $this->assertContains('VRT News', $sources);
$this->assertContains('Belga News Agency', $sources); $this->assertContains('Belga News Agency', $sources);
$this->assertContains('The Guardian', $sources);
} }
public function test_get_supported_sources_returns_sources_in_correct_order(): void public function test_get_supported_sources_returns_sources_in_correct_order(): void
@ -88,7 +89,7 @@ public function getSourceName(): string
// Verify it's now included in supported sources // Verify it's now included in supported sources
$sources = ArticleParserFactory::getSupportedSources(); $sources = ArticleParserFactory::getSupportedSources();
$this->assertContains('TestParser', $sources); $this->assertContains('TestParser', $sources);
$this->assertCount(3, $sources); // Original 2 + 1 new $this->assertCount(4, $sources); // Original 3 + 1 new
// Verify it can be used to parse URLs // Verify it can be used to parse URLs
$testUrl = 'https://test-parser.com/article'; $testUrl = 'https://test-parser.com/article';

View file

@ -0,0 +1,285 @@
<?php
namespace Tests\Unit\Services\Parsers;
use App\Services\Parsers\GuardianArticlePageParser;
use Tests\TestCase;
/**
 * Unit tests for the static extractors of GuardianArticlePageParser.
 *
 * Each test feeds a small hand-written HTML fixture to one extractor and
 * checks the returned value (or null when the fixture lacks the element).
 */
class GuardianArticlePageParserTest extends TestCase
{
// --- extractTitle ---------------------------------------------------------
public function test_extract_title_from_og_meta_tag(): void
{
$html = '<html><head><meta property="og:title" content="Guardian Article Title"/></head><body></body></html>';
$title = GuardianArticlePageParser::extractTitle($html);
$this->assertEquals('Guardian Article Title', $title);
}
// Fixture has no og:title, so the <h1> text is the expected title.
public function test_extract_title_from_h1_tag(): void
{
$html = '<html><body><h1>H1 Title Test</h1></body></html>';
$title = GuardianArticlePageParser::extractTitle($html);
$this->assertEquals('H1 Title Test', $title);
}
// Fixture has neither og:title nor <h1>, so the <title> text is expected.
public function test_extract_title_from_title_tag(): void
{
$html = '<html><head><title>Page Title Test</title></head><body></body></html>';
$title = GuardianArticlePageParser::extractTitle($html);
$this->assertEquals('Page Title Test', $title);
}
// &amp; and &quot; in the attribute value must come back decoded.
public function test_extract_title_with_html_entities(): void
{
$html = '<html><head><meta property="og:title" content="Test &amp; Article &quot;Title&quot;"/></head></html>';
$title = GuardianArticlePageParser::extractTitle($html);
$this->assertEquals('Test & Article "Title"', $title);
}
public function test_extract_title_returns_null_when_not_found(): void
{
$html = '<html><body><p>No title here</p></body></html>';
$title = GuardianArticlePageParser::extractTitle($html);
$this->assertNull($title);
}
// --- extractDescription ---------------------------------------------------
public function test_extract_description_from_og_meta_tag(): void
{
$html = '<html><head><meta property="og:description" content="Guardian article description"/></head></html>';
$description = GuardianArticlePageParser::extractDescription($html);
$this->assertEquals('Guardian article description', $description);
}
// Fixture has no og:description, so the first <p> is the expected description.
public function test_extract_description_from_paragraph(): void
{
$html = '<html><body><p>This is the first paragraph description.</p></body></html>';
$description = GuardianArticlePageParser::extractDescription($html);
$this->assertEquals('This is the first paragraph description.', $description);
}
public function test_extract_description_returns_null_when_not_found(): void
{
$html = '<html><body><div>No description here</div></body></html>';
$description = GuardianArticlePageParser::extractDescription($html);
$this->assertNull($description);
}
// --- extractFullArticle ---------------------------------------------------
// Paragraphs inside the article-body container are joined with a blank line.
public function test_extract_full_article_from_guardian_article_body(): void
{
$html = '
<html>
<body>
<div class="article-body-commercial-selector">
<p>First paragraph of the article.</p>
<p>Second paragraph of the article.</p>
</div>
</body>
</html>
';
$fullArticle = GuardianArticlePageParser::extractFullArticle($html);
$expected = "First paragraph of the article.\n\nSecond paragraph of the article.";
$this->assertEquals($expected, $fullArticle);
}
// Without the article-body container, every <p> in the page is used.
public function test_extract_full_article_fallback_to_all_paragraphs(): void
{
$html = '
<html>
<body>
<p>First general paragraph.</p>
<p>Second general paragraph.</p>
</body>
</html>
';
$fullArticle = GuardianArticlePageParser::extractFullArticle($html);
$expected = "First general paragraph.\n\nSecond general paragraph.";
$this->assertEquals($expected, $fullArticle);
}
// The whitespace-only and empty <p> elements must not appear in the joined text.
public function test_extract_full_article_filters_empty_paragraphs(): void
{
$html = '
<html>
<body>
<div class="article-body-commercial-selector">
<p>Content paragraph.</p>
<p> </p>
<p></p>
<p>Another content paragraph.</p>
</div>
</body>
</html>
';
$fullArticle = GuardianArticlePageParser::extractFullArticle($html);
$expected = "Content paragraph.\n\nAnother content paragraph.";
$this->assertEquals($expected, $fullArticle);
}
// Inline markup (<strong>, <em>, <a>) is stripped while its text is kept.
public function test_extract_full_article_handles_nested_tags(): void
{
$html = '
<html>
<body>
<div class="article-body-commercial-selector">
<p>This has <strong>bold text</strong> and <em>italic text</em>.</p>
<p>This has <a href="#">a link</a> inside.</p>
</div>
</body>
</html>
';
$fullArticle = GuardianArticlePageParser::extractFullArticle($html);
$expected = "This has bold text and italic text.\n\nThis has a link inside.";
$this->assertEquals($expected, $fullArticle);
}
// Script and style contents must not leak into the extracted text.
public function test_extract_full_article_removes_scripts_and_styles(): void
{
$html = '
<html>
<head>
<script>console.log("test");</script>
<style>.test { color: red; }</style>
</head>
<body>
<div class="article-body-commercial-selector">
<p>Clean content.</p>
</div>
<script>alert("bad");</script>
</body>
</html>
';
$fullArticle = GuardianArticlePageParser::extractFullArticle($html);
$this->assertEquals('Clean content.', $fullArticle);
$this->assertStringNotContainsString('console.log', $fullArticle);
$this->assertStringNotContainsString('alert', $fullArticle);
}
public function test_extract_full_article_returns_null_when_no_content(): void
{
$html = '<html><body><div>No paragraphs here</div></body></html>';
$fullArticle = GuardianArticlePageParser::extractFullArticle($html);
$this->assertNull($fullArticle);
}
// --- extractThumbnail -----------------------------------------------------
public function test_extract_thumbnail_from_og_image(): void
{
$html = '<html><head><meta property="og:image" content="https://i.guim.co.uk/img/test.jpg"/></head></html>';
$thumbnail = GuardianArticlePageParser::extractThumbnail($html);
$this->assertEquals('https://i.guim.co.uk/img/test.jpg', $thumbnail);
}
// Fixture has no og:image, so the <img> src is the expected thumbnail.
public function test_extract_thumbnail_from_img_tag(): void
{
$html = '<html><body><img src="https://i.guim.co.uk/img/article-image.png" alt="test"/></body></html>';
$thumbnail = GuardianArticlePageParser::extractThumbnail($html);
$this->assertEquals('https://i.guim.co.uk/img/article-image.png', $thumbnail);
}
public function test_extract_thumbnail_returns_null_when_not_found(): void
{
$html = '<html><body><div>No images here</div></body></html>';
$thumbnail = GuardianArticlePageParser::extractThumbnail($html);
$this->assertNull($thumbnail);
}
// --- extractData ----------------------------------------------------------
// All four keys must be present and carry the values from the fixture.
public function test_extract_data_returns_all_components(): void
{
$html = '
<html>
<head>
<meta property="og:title" content="Guardian Test Article"/>
<meta property="og:description" content="Test description"/>
<meta property="og:image" content="https://i.guim.co.uk/img/image.jpg"/>
</head>
<body>
<div class="article-body-commercial-selector">
<p>Full article content here.</p>
</div>
</body>
</html>
';
$data = GuardianArticlePageParser::extractData($html);
$this->assertIsArray($data);
$this->assertArrayHasKey('title', $data);
$this->assertArrayHasKey('description', $data);
$this->assertArrayHasKey('full_article', $data);
$this->assertArrayHasKey('thumbnail', $data);
$this->assertEquals('Guardian Test Article', $data['title']);
$this->assertEquals('Test description', $data['description']);
$this->assertEquals('Full article content here.', $data['full_article']);
$this->assertEquals('https://i.guim.co.uk/img/image.jpg', $data['thumbnail']);
}
// A page with none of the recognised elements yields null for every key.
public function test_extract_data_handles_missing_components_gracefully(): void
{
$html = '<html><body><div>Minimal content</div></body></html>';
$data = GuardianArticlePageParser::extractData($html);
$this->assertIsArray($data);
$this->assertNull($data['title']);
$this->assertNull($data['description']);
$this->assertNull($data['full_article']);
$this->assertNull($data['thumbnail']);
}
// Longer fixture: checks paragraph text, the blank-line separator and tag stripping together.
public function test_extract_full_article_with_realistic_guardian_html(): void
{
$html = '
<html>
<body>
<div class="article-body-commercial-selector">
<p><strong>The prime minister has announced a new climate policy that aims to reduce carbon emissions by 50% by 2030.</strong></p>
<p>The announcement came during a press conference at Downing Street on Tuesday afternoon.</p>
<p>Environmental groups have cautiously welcomed the move, while industry leaders have expressed concern about the timeline.</p>
</div>
</body>
</html>
';
$fullArticle = GuardianArticlePageParser::extractFullArticle($html);
$this->assertNotNull($fullArticle);
$this->assertStringContainsString('climate policy', $fullArticle);
$this->assertStringContainsString('press conference', $fullArticle);
$this->assertStringContainsString('Environmental groups', $fullArticle);
$this->assertStringContainsString("\n\n", $fullArticle);
$this->assertStringNotContainsString('<strong>', $fullArticle);
}
}

View file

@ -0,0 +1,63 @@
<?php
namespace Tests\Unit\Services\Parsers;
use App\Contracts\ArticleParserInterface;
use App\Services\Parsers\GuardianArticleParser;
use Tests\TestCase;
/**
 * Unit tests for GuardianArticleParser: interface conformance, URL matching,
 * source name, and delegation of data extraction.
 */
class GuardianArticleParserTest extends TestCase
{
    /** Fresh parser instance built for every test. */
    private GuardianArticleParser $parser;

    protected function setUp(): void
    {
        parent::setUp();

        $this->parser = new GuardianArticleParser();
    }

    public function test_implements_article_parser_interface(): void
    {
        self::assertInstanceOf(ArticleParserInterface::class, $this->parser);
    }

    public function test_can_parse_guardian_url(): void
    {
        $accepted = $this->parser->canParse('https://www.theguardian.com/world/2026/mar/08/some-article');

        self::assertTrue($accepted);
    }

    public function test_can_parse_guardian_url_without_www(): void
    {
        $accepted = $this->parser->canParse('https://theguardian.com/world/2026/mar/08/some-article');

        self::assertTrue($accepted);
    }

    public function test_cannot_parse_non_guardian_url(): void
    {
        $foreignUrls = [
            'https://www.vrt.be/vrtnws/en/article',
            'https://www.belganewsagency.eu/article',
        ];

        foreach ($foreignUrls as $foreignUrl) {
            self::assertFalse($this->parser->canParse($foreignUrl));
        }
    }

    public function test_get_source_name(): void
    {
        self::assertEquals('The Guardian', $this->parser->getSourceName());
    }

    public function test_extract_data_delegates_to_page_parser(): void
    {
        // The blank first and last lines keep the leading and trailing newline
        // of the fixture string.
        $page = <<<'HTML'

        <html>
        <head>
        <meta property="og:title" content="Test Title"/>
        <meta property="og:description" content="Test Description"/>
        </head>
        <body><p>Content</p></body>
        </html>

        HTML;

        $extracted = $this->parser->extractData($page);

        self::assertIsArray($extracted);
        self::assertArrayHasKey('title', $extracted);
        self::assertEquals('Test Title', $extracted['title']);
    }
}