37 - Add The Guardian as RSS feed provider, implement RSS parsing
This commit is contained in:
parent
0123e20b1d
commit
1e39a25f83
12 changed files with 754 additions and 10 deletions
|
|
@ -12,13 +12,15 @@ public function authorize(): bool
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return array<string, string>
|
* @return array<string, mixed>
|
||||||
*/
|
*/
|
||||||
public function rules(): array
|
public function rules(): array
|
||||||
{
|
{
|
||||||
|
$providers = implode(',', array_keys(config('feed.providers', [])));
|
||||||
|
|
||||||
return [
|
return [
|
||||||
'name' => 'required|string|max:255',
|
'name' => 'required|string|max:255',
|
||||||
'provider' => 'required|in:vrt,belga',
|
'provider' => "required|in:{$providers}",
|
||||||
'language_id' => 'required|exists:languages,id',
|
'language_id' => 'required|exists:languages,id',
|
||||||
'description' => 'nullable|string',
|
'description' => 'nullable|string',
|
||||||
'is_active' => 'boolean'
|
'is_active' => 'boolean'
|
||||||
|
|
|
||||||
|
|
@ -41,9 +41,45 @@ public function getArticlesFromFeed(Feed $feed): Collection
|
||||||
*/
|
*/
|
||||||
/**
 * Fetch an RSS document for the given feed and store one article per item link.
 *
 * The feed URL is downloaded through HttpFetcher::fetchHtml(), parsed with
 * SimpleXML, and every <item><link> found under <channel> is handed to
 * saveArticle(). Any failure is logged and results in an empty collection,
 * so a broken feed never interrupts the caller.
 *
 * @param Feed $feed Feed whose `url` points at an RSS 2.0 document.
 *
 * @return Collection Results of saveArticle() for each usable item link;
 *                    empty when the document cannot be fetched or parsed,
 *                    or contains no items.
 */
private function getArticlesFromRssFeed(Feed $feed): Collection
{
    try {
        $xml = HttpFetcher::fetchHtml($feed->url);

        // Collect libxml errors internally instead of emitting PHP warnings
        // for malformed XML; the previous setting is restored afterwards.
        $previousUseErrors = libxml_use_internal_errors(true);

        try {
            // LIBXML_NONET: the document comes from a remote server, so never
            // let the parser open network resources on its behalf.
            $rss = simplexml_load_string($xml, \SimpleXMLElement::class, LIBXML_NONET);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousUseErrors);
        }

        if ($rss === false || !isset($rss->channel->item)) {
            $this->logSaver->warning("Failed to parse RSS feed XML", null, [
                'feed_id' => $feed->id,
                'feed_url' => $feed->url,
            ]);

            return collect();
        }

        $articles = collect();

        foreach ($rss->channel->item as $item) {
            // Pretty-printed feeds wrap the URL in newlines/indentation;
            // trim so the stored URL is clean and blank links are skipped.
            $link = trim((string) $item->link);

            if ($link === '') {
                continue;
            }

            $articles->push($this->saveArticle($link, $feed->id));
        }

        return $articles;
    } catch (Exception $e) {
        $this->logSaver->error("Failed to fetch articles from RSS feed", null, [
            'feed_id' => $feed->id,
            'feed_url' => $feed->url,
            'error' => $e->getMessage(),
        ]);

        return collect();
    }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@
|
||||||
use App\Models\Feed;
|
use App\Models\Feed;
|
||||||
use App\Services\Parsers\VrtArticleParser;
|
use App\Services\Parsers\VrtArticleParser;
|
||||||
use App\Services\Parsers\BelgaArticleParser;
|
use App\Services\Parsers\BelgaArticleParser;
|
||||||
|
use App\Services\Parsers\GuardianArticleParser;
|
||||||
use Exception;
|
use Exception;
|
||||||
|
|
||||||
class ArticleParserFactory
|
class ArticleParserFactory
|
||||||
|
|
@ -16,6 +17,7 @@ class ArticleParserFactory
|
||||||
private static array $parsers = [
|
private static array $parsers = [
|
||||||
VrtArticleParser::class,
|
VrtArticleParser::class,
|
||||||
BelgaArticleParser::class,
|
BelgaArticleParser::class,
|
||||||
|
GuardianArticleParser::class,
|
||||||
];
|
];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
110
app/Services/Parsers/GuardianArticlePageParser.php
Normal file
110
app/Services/Parsers/GuardianArticlePageParser.php
Normal file
|
|
@ -0,0 +1,110 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Services\Parsers;
|
||||||
|
|
||||||
|
/**
 * Regex-based extractor for the parts of a Guardian article page that the
 * application stores: title, description, body text and thumbnail URL.
 *
 * All methods are static and side-effect free; each returns null when the
 * requested piece cannot be found in the supplied markup.
 */
class GuardianArticlePageParser
{
    /** Class-name fragment of the <div> that wraps the article body. */
    private const ARTICLE_BODY_CLASS = 'article-body-commercial-selector';

    /** Matches real <p> elements only (not <picture>, <path>, <pre>, ...). */
    private const PARAGRAPH_PATTERN = '/<p\b[^>]*>(.*?)<\/p>/is';

    /**
     * Extract the article title.
     *
     * Tries the og:title meta tag, then the first <h1>, then <title>.
     *
     * @param string $html Full page markup.
     *
     * @return string|null Decoded, trimmed title, or null when none is found.
     */
    public static function extractTitle(string $html): ?string
    {
        $title = self::extractMetaContent($html, 'og:title');
        if ($title !== null) {
            return $title;
        }

        // Headings may wrap their text in nested tags (<span>, <a>, ...),
        // so capture the whole element body and strip the markup afterwards.
        $fallbackPatterns = [
            '/<h1\b[^>]*>(.*?)<\/h1>/is',
            '/<title\b[^>]*>(.*?)<\/title>/is',
        ];

        foreach ($fallbackPatterns as $pattern) {
            if (preg_match($pattern, $html, $matches) !== 1) {
                continue;
            }

            $text = self::cleanText($matches[1]);
            if ($text !== '') {
                return $text;
            }
        }

        return null;
    }

    /**
     * Extract a short description of the article.
     *
     * Tries the og:description meta tag, then the first paragraph that
     * contains any text.
     *
     * @param string $html Full page markup.
     *
     * @return string|null Decoded, trimmed description, or null when none is found.
     */
    public static function extractDescription(string $html): ?string
    {
        $description = self::extractMetaContent($html, 'og:description');
        if ($description !== null) {
            return $description;
        }

        if (preg_match_all(self::PARAGRAPH_PATTERN, $html, $matches) > 0) {
            foreach ($matches[1] as $paragraph) {
                $text = self::cleanText($paragraph);
                if ($text !== '') {
                    return $text;
                }
            }
        }

        return null;
    }

    /**
     * Extract the article body as plain text, paragraphs separated by a blank line.
     *
     * Paragraphs inside the Guardian article-body container are preferred;
     * when that container is missing or holds no text, every paragraph in
     * the page is used instead.
     *
     * @param string $html Full page markup.
     *
     * @return string|null Body text, or null when the page has no paragraph text.
     */
    public static function extractFullArticle(string $html): ?string
    {
        // Remove scripts and styles so their contents never leak into the text.
        $cleanHtml = $html;
        $noisePatterns = [
            '/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi',
            '/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi',
        ];

        foreach ($noisePatterns as $pattern) {
            // preg_replace() yields null on a PCRE failure (e.g. backtrack
            // limit on a huge page); keep the previous markup in that case.
            $cleanHtml = preg_replace($pattern, '', $cleanHtml) ?? $cleanHtml;
        }

        $bodyHtml = self::extractContainerHtml($cleanHtml, self::ARTICLE_BODY_CLASS);
        if ($bodyHtml !== null) {
            $bodyText = self::extractParagraphText($bodyHtml);
            if ($bodyText !== null) {
                return $bodyText;
            }
        }

        // Fallback: every paragraph in the page.
        return self::extractParagraphText($cleanHtml);
    }

    /**
     * Extract the URL of the article's lead image.
     *
     * Tries the og:image meta tag, then the first <img> with a src attribute.
     * Attribute values are HTML-decoded, so "&amp;" in a query string becomes "&".
     *
     * @param string $html Full page markup.
     *
     * @return string|null Image URL, or null when none is found.
     */
    public static function extractThumbnail(string $html): ?string
    {
        $image = self::extractMetaContent($html, 'og:image');
        if ($image !== null) {
            return $image;
        }

        // "\ssrc" keeps data-src / srcset style attributes from matching.
        $pattern = '/<img\b[^>]*?\ssrc\s*=\s*(?|"([^"]*)"|\'([^\']*)\')/i';
        if (preg_match($pattern, $html, $matches) === 1) {
            $src = self::cleanText($matches[1]);
            if ($src !== '') {
                return $src;
            }
        }

        return null;
    }

    /**
     * Run every extractor over the page.
     *
     * @param string $html Full page markup.
     *
     * @return array<string, string|null> Keys: title, description, full_article, thumbnail.
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
            'thumbnail' => self::extractThumbnail($html),
        ];
    }

    /**
     * Read the content attribute of a <meta property="..."> tag.
     *
     * Accepts either attribute order and single or double quotes.
     *
     * @param string $html     Markup to search.
     * @param string $property Value of the property attribute, e.g. "og:title".
     *
     * @return string|null Decoded, trimmed content, or null when absent or empty.
     */
    private static function extractMetaContent(string $html, string $property): ?string
    {
        $name = preg_quote($property, '/');
        // Branch-reset group: the value is always capture group 1,
        // whichever quote style the tag uses.
        $value = '(?|"([^"]*)"|\'([^\']*)\')';

        $patterns = [
            '/<meta\b[^>]*?\sproperty\s*=\s*["\']' . $name . '["\'][^>]*?\scontent\s*=\s*' . $value . '/i',
            '/<meta\b[^>]*?\scontent\s*=\s*' . $value . '[^>]*?\sproperty\s*=\s*["\']' . $name . '["\']/i',
        ];

        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $html, $matches) !== 1) {
                continue;
            }

            $content = self::cleanText($matches[1]);
            if ($content !== '') {
                return $content;
            }
        }

        return null;
    }

    /**
     * Return the inner HTML of the first <div> whose class attribute contains
     * the given fragment, honouring nested <div> elements.
     *
     * @param string $html          Markup to search.
     * @param string $classFragment Substring expected inside the class attribute.
     *
     * @return string|null Inner HTML of the container, or null when no such <div> exists.
     */
    private static function extractContainerHtml(string $html, string $classFragment): ?string
    {
        $opening = '/<div\b[^>]*\sclass\s*=\s*(["\'])[^"\']*'
            . preg_quote($classFragment, '/')
            . '[^"\']*\1[^>]*>/i';

        if (preg_match($opening, $html, $match, PREG_OFFSET_CAPTURE) !== 1) {
            return null;
        }

        $start = $match[0][1] + strlen($match[0][0]);
        $depth = 1;

        // Walk every following <div> / </div> and stop at the tag that closes
        // the container itself, so trailing page content is not swept in.
        $found = preg_match_all(
            '/<(\/?)div\b[^>]*>/i',
            $html,
            $tags,
            PREG_SET_ORDER | PREG_OFFSET_CAPTURE,
            $start,
        );

        if ($found > 0) {
            foreach ($tags as $tag) {
                $depth += $tag[1][0] === '/' ? -1 : 1;

                if ($depth === 0) {
                    return substr($html, $start, $tag[0][1] - $start);
                }
            }
        }

        // Unbalanced markup: take everything after the opening tag.
        return substr($html, $start);
    }

    /**
     * Collect the text of every <p> element in the fragment.
     *
     * @param string $html Markup fragment.
     *
     * @return string|null Joined paragraph text, or null when there is none.
     */
    private static function extractParagraphText(string $html): ?string
    {
        if (preg_match_all(self::PARAGRAPH_PATTERN, $html, $matches) > 0) {
            return self::joinParagraphs($matches[1]);
        }

        return null;
    }

    /**
     * Strip tags, decode entities and trim a markup fragment.
     *
     * @param string $fragment Raw HTML fragment or attribute value.
     *
     * @return string Plain text (possibly empty).
     */
    private static function cleanText(string $fragment): string
    {
        return trim(html_entity_decode(strip_tags($fragment), ENT_QUOTES | ENT_HTML5, 'UTF-8'));
    }

    /**
     * Turn raw paragraph bodies into one text block.
     *
     * @param array<int, string> $paragraphs Inner HTML of each paragraph.
     *
     * @return string|null Non-empty paragraphs joined by a blank line, or null when all are empty.
     */
    private static function joinParagraphs(array $paragraphs): ?string
    {
        $cleaned = array_filter(
            array_map(static fn (string $paragraph): string => self::cleanText($paragraph), $paragraphs),
            static fn (string $text): bool => $text !== '',
        );

        $fullText = implode("\n\n", $cleaned);

        // Explicit comparison: a body consisting of "0" is still content.
        return $fullText !== '' ? $fullText : null;
    }
}
|
||||||
23
app/Services/Parsers/GuardianArticleParser.php
Normal file
23
app/Services/Parsers/GuardianArticleParser.php
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Services\Parsers;
|
||||||
|
|
||||||
|
use App\Contracts\ArticleParserInterface;
|
||||||
|
|
||||||
|
/**
 * Article parser for pages hosted on theguardian.com.
 *
 * Decides whether a URL belongs to The Guardian and delegates the actual
 * HTML extraction to GuardianArticlePageParser.
 */
class GuardianArticleParser implements ArticleParserInterface
{
    /** Registrable domain served by this parser (subdomains are accepted too). */
    private const HOST = 'theguardian.com';

    /**
     * Tell whether the URL points at The Guardian.
     *
     * The decision is made on the URL's host, so a foreign URL that merely
     * mentions "theguardian.com" in its path or query string, or a look-alike
     * domain such as "nottheguardian.com", is rejected.
     *
     * @param string $url Absolute URL; a missing scheme is tolerated.
     *
     * @return bool True when the host is theguardian.com or one of its subdomains.
     */
    public function canParse(string $url): bool
    {
        $candidate = trim($url);

        // parse_url() only reports a host when a scheme (or a leading "//")
        // is present, so supply one for bare "www.theguardian.com/..." input.
        if (preg_match('#^(?:[a-z][a-z0-9+.-]*:)?//#i', $candidate) !== 1) {
            $candidate = 'https://' . $candidate;
        }

        $host = parse_url($candidate, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return false;
        }

        $host = strtolower($host);

        return $host === self::HOST || str_ends_with($host, '.' . self::HOST);
    }

    /**
     * Extract the article fields from a Guardian page.
     *
     * @param string $html Full page markup.
     *
     * @return array<string, string|null> Result of GuardianArticlePageParser::extractData().
     */
    public function extractData(string $html): array
    {
        return GuardianArticlePageParser::extractData($html);
    }

    /**
     * Human-readable name of the source handled by this parser.
     */
    public function getSourceName(): string
    {
        return 'The Guardian';
    }
}
|
||||||
|
|
@ -33,7 +33,7 @@
|
||||||
'code' => 'belga',
|
'code' => 'belga',
|
||||||
'name' => 'Belga News Agency',
|
'name' => 'Belga News Agency',
|
||||||
'description' => 'Belgian national news agency',
|
'description' => 'Belgian national news agency',
|
||||||
'type' => 'rss',
|
'type' => 'website',
|
||||||
'is_active' => true,
|
'is_active' => true,
|
||||||
'languages' => [
|
'languages' => [
|
||||||
'en' => ['url' => 'https://www.belganewsagency.eu/'],
|
'en' => ['url' => 'https://www.belganewsagency.eu/'],
|
||||||
|
|
@ -44,6 +44,20 @@
|
||||||
'article_page' => \App\Services\Parsers\BelgaArticlePageParser::class,
|
'article_page' => \App\Services\Parsers\BelgaArticlePageParser::class,
|
||||||
],
|
],
|
||||||
],
|
],
|
||||||
|
'guardian' => [
|
||||||
|
'code' => 'guardian',
|
||||||
|
'name' => 'The Guardian',
|
||||||
|
'description' => 'British daily newspaper',
|
||||||
|
'type' => 'rss',
|
||||||
|
'is_active' => true,
|
||||||
|
'languages' => [
|
||||||
|
'en' => ['url' => 'https://www.theguardian.com/international/rss'],
|
||||||
|
],
|
||||||
|
'parsers' => [
|
||||||
|
'article' => \App\Services\Parsers\GuardianArticleParser::class,
|
||||||
|
'article_page' => \App\Services\Parsers\GuardianArticlePageParser::class,
|
||||||
|
],
|
||||||
|
],
|
||||||
],
|
],
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
||||||
|
|
@ -99,7 +99,7 @@ public function test_store_creates_belga_feed_successfully(): void
|
||||||
'data' => [
|
'data' => [
|
||||||
'name' => 'Belga Test Feed',
|
'name' => 'Belga Test Feed',
|
||||||
'url' => 'https://www.belganewsagency.eu/',
|
'url' => 'https://www.belganewsagency.eu/',
|
||||||
'type' => 'rss',
|
'type' => 'website',
|
||||||
'is_active' => true,
|
'is_active' => true,
|
||||||
]
|
]
|
||||||
]);
|
]);
|
||||||
|
|
@ -107,6 +107,38 @@ public function test_store_creates_belga_feed_successfully(): void
|
||||||
$this->assertDatabaseHas('feeds', [
|
$this->assertDatabaseHas('feeds', [
|
||||||
'name' => 'Belga Test Feed',
|
'name' => 'Belga Test Feed',
|
||||||
'url' => 'https://www.belganewsagency.eu/',
|
'url' => 'https://www.belganewsagency.eu/',
|
||||||
|
'type' => 'website',
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Posting a feed with the "guardian" provider creates an RSS feed that
 * points at the Guardian international RSS URL.
 */
public function test_store_creates_guardian_feed_successfully(): void
{
    $english = Language::factory()->english()->create();

    $payload = [
        'name' => 'Guardian Test Feed',
        'provider' => 'guardian',
        'language_id' => $english->id,
        'is_active' => true,
    ];

    $response = $this->postJson('/api/v1/feeds', $payload);

    $response->assertStatus(201);
    $response->assertJson([
        'success' => true,
        'message' => 'Feed created successfully!',
        'data' => [
            'name' => 'Guardian Test Feed',
            'url' => 'https://www.theguardian.com/international/rss',
            'type' => 'rss',
            'is_active' => true,
        ]
    ]);

    // The stored row must carry the URL and type taken from the provider config.
    $expectedRow = [
        'name' => 'Guardian Test Feed',
        'url' => 'https://www.theguardian.com/international/rss',
        'type' => 'rss',
    ];
    $this->assertDatabaseHas('feeds', $expectedRow);
}
|
||||||
|
|
|
||||||
|
|
@ -43,11 +43,23 @@ public function test_creates_belga_feed_with_correct_url(): void
|
||||||
$feed = $this->action->execute('Belga News', 'belga', $language->id);
|
$feed = $this->action->execute('Belga News', 'belga', $language->id);
|
||||||
|
|
||||||
$this->assertEquals('https://www.belganewsagency.eu/', $feed->url);
|
$this->assertEquals('https://www.belganewsagency.eu/', $feed->url);
|
||||||
$this->assertEquals('rss', $feed->type);
|
$this->assertEquals('website', $feed->type);
|
||||||
$this->assertEquals('belga', $feed->provider);
|
$this->assertEquals('belga', $feed->provider);
|
||||||
$this->assertNull($feed->description);
|
$this->assertNull($feed->description);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Executing the action for the "guardian" provider yields an RSS feed
 * with the Guardian URL and no description.
 */
public function test_creates_guardian_feed_with_correct_url(): void
{
    $languageAttributes = ['short_code' => 'en', 'is_active' => true];
    $english = Language::factory()->create($languageAttributes);

    $created = $this->action->execute('Guardian News', 'guardian', $english->id);

    $this->assertEquals('https://www.theguardian.com/international/rss', $created->url);
    $this->assertEquals('rss', $created->type);
    $this->assertEquals('guardian', $created->provider);
    $this->assertNull($created->description);
}
|
||||||
|
|
||||||
public function test_creates_vrt_feed_with_dutch_language(): void
|
public function test_creates_vrt_feed_with_dutch_language(): void
|
||||||
{
|
{
|
||||||
$language = Language::factory()->create(['short_code' => 'nl', 'is_active' => true]);
|
$language = Language::factory()->create(['short_code' => 'nl', 'is_active' => true]);
|
||||||
|
|
|
||||||
164
tests/Unit/Services/ArticleFetcherRssTest.php
Normal file
164
tests/Unit/Services/ArticleFetcherRssTest.php
Normal file
|
|
@ -0,0 +1,164 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Tests\Unit\Services;
|
||||||
|
|
||||||
|
use App\Models\Article;
|
||||||
|
use App\Models\Feed;
|
||||||
|
use Illuminate\Foundation\Testing\RefreshDatabase;
|
||||||
|
use Illuminate\Support\Facades\Http;
|
||||||
|
use Mockery;
|
||||||
|
use Tests\TestCase;
|
||||||
|
use Tests\Traits\CreatesArticleFetcher;
|
||||||
|
|
||||||
|
/**
 * Exercises the RSS branch of the article fetcher against faked HTTP
 * responses: successful parsing, de-duplication and the failure paths.
 */
class ArticleFetcherRssTest extends TestCase
{
    use RefreshDatabase, CreatesArticleFetcher;

    /** Two-item RSS document served by the faked HTTP client. */
    private string $sampleRss;

    protected function setUp(): void
    {
        parent::setUp();

        $this->sampleRss = <<<'XML'
            <?xml version="1.0" encoding="UTF-8"?>
            <rss version="2.0">
                <channel>
                    <title>The Guardian - International</title>
                    <link>https://www.theguardian.com/international</link>
                    <item>
                        <title>First Article Title</title>
                        <link>https://www.theguardian.com/world/2026/mar/08/first-article</link>
                        <description>First article description</description>
                        <pubDate>Sun, 08 Mar 2026 12:00:00 GMT</pubDate>
                    </item>
                    <item>
                        <title>Second Article Title</title>
                        <link>https://www.theguardian.com/world/2026/mar/08/second-article</link>
                        <description>Second article description</description>
                        <pubDate>Sun, 08 Mar 2026 11:00:00 GMT</pubDate>
                    </item>
                </channel>
            </rss>
            XML;
    }

    protected function tearDown(): void
    {
        Mockery::close();
        parent::tearDown();
    }

    public function test_get_articles_from_rss_feed_returns_collection(): void
    {
        $this->fakeFeedResponse($this->sampleRss);
        $guardianFeed = $this->createGuardianFeed();

        $articles = $this->createArticleFetcher()->getArticlesFromFeed($guardianFeed);

        $this->assertInstanceOf(\Illuminate\Support\Collection::class, $articles);
    }

    public function test_get_articles_from_rss_feed_creates_articles(): void
    {
        $this->fakeFeedResponse($this->sampleRss);
        $guardianFeed = $this->createGuardianFeed();

        $articles = $this->createArticleFetcher()->getArticlesFromFeed($guardianFeed);

        $this->assertCount(2, $articles);

        $expectedUrls = [
            'https://www.theguardian.com/world/2026/mar/08/first-article',
            'https://www.theguardian.com/world/2026/mar/08/second-article',
        ];
        foreach ($expectedUrls as $expectedUrl) {
            $this->assertDatabaseHas('articles', [
                'url' => $expectedUrl,
                'feed_id' => $guardianFeed->id,
            ]);
        }
    }

    public function test_get_articles_from_rss_feed_does_not_duplicate_existing(): void
    {
        $this->fakeFeedResponse($this->sampleRss);
        $guardianFeed = $this->createGuardianFeed();

        // The first item of the sample feed is already stored.
        Article::factory()->create([
            'url' => 'https://www.theguardian.com/world/2026/mar/08/first-article',
            'feed_id' => $guardianFeed->id,
        ]);

        $articles = $this->createArticleFetcher()->getArticlesFromFeed($guardianFeed);

        $this->assertCount(2, $articles);
        $this->assertEquals(1, Article::where('url', 'https://www.theguardian.com/world/2026/mar/08/first-article')->count());
    }

    public function test_get_articles_from_rss_feed_handles_invalid_xml(): void
    {
        $this->fakeFeedResponse('this is not xml');
        $guardianFeed = $this->createGuardianFeed();

        $articles = $this->createArticleFetcher()->getArticlesFromFeed($guardianFeed);

        $this->assertInstanceOf(\Illuminate\Support\Collection::class, $articles);
        $this->assertEmpty($articles);
    }

    public function test_get_articles_from_rss_feed_handles_empty_channel(): void
    {
        $this->fakeFeedResponse('<?xml version="1.0"?><rss><channel><title>Empty</title></channel></rss>');
        $guardianFeed = $this->createGuardianFeed();

        $articles = $this->createArticleFetcher()->getArticlesFromFeed($guardianFeed);

        $this->assertEmpty($articles);
    }

    public function test_get_articles_from_rss_feed_handles_http_failure(): void
    {
        $this->fakeFeedResponse('Server Error', 500);
        $guardianFeed = $this->createGuardianFeed();

        $articles = $this->createArticleFetcher()->getArticlesFromFeed($guardianFeed);

        $this->assertEmpty($articles);
    }

    /**
     * Make every outgoing HTTP request return the given body and status.
     */
    private function fakeFeedResponse(string $body, int $status = 200): void
    {
        Http::fake(['*' => Http::response($body, $status)]);
    }

    /**
     * Persist an RSS feed for the Guardian provider.
     */
    private function createGuardianFeed(): Feed
    {
        return Feed::factory()->create([
            'type' => 'rss',
            'provider' => 'guardian',
            'url' => 'https://www.theguardian.com/international/rss',
        ]);
    }
}
|
||||||
|
|
@ -46,9 +46,10 @@ public function test_get_supported_sources_returns_array_of_source_names(): void
|
||||||
$sources = ArticleParserFactory::getSupportedSources();
|
$sources = ArticleParserFactory::getSupportedSources();
|
||||||
|
|
||||||
$this->assertIsArray($sources);
|
$this->assertIsArray($sources);
|
||||||
$this->assertCount(2, $sources);
|
$this->assertCount(3, $sources);
|
||||||
$this->assertContains('VRT News', $sources);
|
$this->assertContains('VRT News', $sources);
|
||||||
$this->assertContains('Belga News Agency', $sources);
|
$this->assertContains('Belga News Agency', $sources);
|
||||||
|
$this->assertContains('The Guardian', $sources);
|
||||||
}
|
}
|
||||||
|
|
||||||
public function test_get_supported_sources_returns_sources_in_correct_order(): void
|
public function test_get_supported_sources_returns_sources_in_correct_order(): void
|
||||||
|
|
@ -88,7 +89,7 @@ public function getSourceName(): string
|
||||||
// Verify it's now included in supported sources
|
// Verify it's now included in supported sources
|
||||||
$sources = ArticleParserFactory::getSupportedSources();
|
$sources = ArticleParserFactory::getSupportedSources();
|
||||||
$this->assertContains('TestParser', $sources);
|
$this->assertContains('TestParser', $sources);
|
||||||
$this->assertCount(3, $sources); // Original 2 + 1 new
|
$this->assertCount(4, $sources); // Original 3 + 1 new
|
||||||
|
|
||||||
// Verify it can be used to parse URLs
|
// Verify it can be used to parse URLs
|
||||||
$testUrl = 'https://test-parser.com/article';
|
$testUrl = 'https://test-parser.com/article';
|
||||||
|
|
|
||||||
285
tests/Unit/Services/Parsers/GuardianArticlePageParserTest.php
Normal file
285
tests/Unit/Services/Parsers/GuardianArticlePageParserTest.php
Normal file
|
|
@ -0,0 +1,285 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Tests\Unit\Services\Parsers;
|
||||||
|
|
||||||
|
use App\Services\Parsers\GuardianArticlePageParser;
|
||||||
|
use Tests\TestCase;
|
||||||
|
|
||||||
|
/**
 * Unit tests for GuardianArticlePageParser: each extractor is checked for
 * its primary source, its fallbacks and the "nothing found" case.
 */
class GuardianArticlePageParserTest extends TestCase
{
    public function test_extract_title_from_og_meta_tag(): void
    {
        $html = '<html><head><meta property="og:title" content="Guardian Article Title"/></head><body></body></html>';

        $title = GuardianArticlePageParser::extractTitle($html);

        $this->assertSame('Guardian Article Title', $title);
    }

    public function test_extract_title_from_h1_tag(): void
    {
        $html = '<html><body><h1>H1 Title Test</h1></body></html>';

        $title = GuardianArticlePageParser::extractTitle($html);

        $this->assertSame('H1 Title Test', $title);
    }

    public function test_extract_title_from_title_tag(): void
    {
        $html = '<html><head><title>Page Title Test</title></head><body></body></html>';

        $title = GuardianArticlePageParser::extractTitle($html);

        $this->assertSame('Page Title Test', $title);
    }

    public function test_extract_title_with_html_entities(): void
    {
        // The attribute value must be entity-encoded: a raw double quote would
        // end the attribute early and the test would no longer cover decoding.
        $html = '<html><head><meta property="og:title" content="Test &amp; Article &quot;Title&quot;"/></head></html>';

        $title = GuardianArticlePageParser::extractTitle($html);

        $this->assertSame('Test & Article "Title"', $title);
    }

    public function test_extract_title_returns_null_when_not_found(): void
    {
        $html = '<html><body><p>No title here</p></body></html>';

        $title = GuardianArticlePageParser::extractTitle($html);

        $this->assertNull($title);
    }

    public function test_extract_description_from_og_meta_tag(): void
    {
        $html = '<html><head><meta property="og:description" content="Guardian article description"/></head></html>';

        $description = GuardianArticlePageParser::extractDescription($html);

        $this->assertSame('Guardian article description', $description);
    }

    public function test_extract_description_from_paragraph(): void
    {
        $html = '<html><body><p>This is the first paragraph description.</p></body></html>';

        $description = GuardianArticlePageParser::extractDescription($html);

        $this->assertSame('This is the first paragraph description.', $description);
    }

    public function test_extract_description_returns_null_when_not_found(): void
    {
        $html = '<html><body><div>No description here</div></body></html>';

        $description = GuardianArticlePageParser::extractDescription($html);

        $this->assertNull($description);
    }

    public function test_extract_full_article_from_guardian_article_body(): void
    {
        $html = '
            <html>
            <body>
                <div class="article-body-commercial-selector">
                    <p>First paragraph of the article.</p>
                    <p>Second paragraph of the article.</p>
                </div>
            </body>
            </html>
        ';

        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);

        $expected = "First paragraph of the article.\n\nSecond paragraph of the article.";
        $this->assertSame($expected, $fullArticle);
    }

    public function test_extract_full_article_fallback_to_all_paragraphs(): void
    {
        $html = '
            <html>
            <body>
                <p>First general paragraph.</p>
                <p>Second general paragraph.</p>
            </body>
            </html>
        ';

        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);

        $expected = "First general paragraph.\n\nSecond general paragraph.";
        $this->assertSame($expected, $fullArticle);
    }

    public function test_extract_full_article_filters_empty_paragraphs(): void
    {
        $html = '
            <html>
            <body>
                <div class="article-body-commercial-selector">
                    <p>Content paragraph.</p>
                    <p> </p>
                    <p></p>
                    <p>Another content paragraph.</p>
                </div>
            </body>
            </html>
        ';

        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);

        $expected = "Content paragraph.\n\nAnother content paragraph.";
        $this->assertSame($expected, $fullArticle);
    }

    public function test_extract_full_article_handles_nested_tags(): void
    {
        $html = '
            <html>
            <body>
                <div class="article-body-commercial-selector">
                    <p>This has <strong>bold text</strong> and <em>italic text</em>.</p>
                    <p>This has <a href="#">a link</a> inside.</p>
                </div>
            </body>
            </html>
        ';

        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);

        $expected = "This has bold text and italic text.\n\nThis has a link inside.";
        $this->assertSame($expected, $fullArticle);
    }

    public function test_extract_full_article_removes_scripts_and_styles(): void
    {
        $html = '
            <html>
            <head>
                <script>console.log("test");</script>
                <style>.test { color: red; }</style>
            </head>
            <body>
                <div class="article-body-commercial-selector">
                    <p>Clean content.</p>
                </div>
                <script>alert("bad");</script>
            </body>
            </html>
        ';

        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);

        $this->assertSame('Clean content.', $fullArticle);
        $this->assertStringNotContainsString('console.log', $fullArticle);
        $this->assertStringNotContainsString('alert', $fullArticle);
    }

    public function test_extract_full_article_returns_null_when_no_content(): void
    {
        $html = '<html><body><div>No paragraphs here</div></body></html>';

        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);

        $this->assertNull($fullArticle);
    }

    public function test_extract_thumbnail_from_og_image(): void
    {
        $html = '<html><head><meta property="og:image" content="https://i.guim.co.uk/img/test.jpg"/></head></html>';

        $thumbnail = GuardianArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://i.guim.co.uk/img/test.jpg', $thumbnail);
    }

    public function test_extract_thumbnail_from_img_tag(): void
    {
        $html = '<html><body><img src="https://i.guim.co.uk/img/article-image.png" alt="test"/></body></html>';

        $thumbnail = GuardianArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://i.guim.co.uk/img/article-image.png', $thumbnail);
    }

    public function test_extract_thumbnail_returns_null_when_not_found(): void
    {
        $html = '<html><body><div>No images here</div></body></html>';

        $thumbnail = GuardianArticlePageParser::extractThumbnail($html);

        $this->assertNull($thumbnail);
    }

    public function test_extract_data_returns_all_components(): void
    {
        $html = '
            <html>
            <head>
                <meta property="og:title" content="Guardian Test Article"/>
                <meta property="og:description" content="Test description"/>
                <meta property="og:image" content="https://i.guim.co.uk/img/image.jpg"/>
            </head>
            <body>
                <div class="article-body-commercial-selector">
                    <p>Full article content here.</p>
                </div>
            </body>
            </html>
        ';

        $data = GuardianArticlePageParser::extractData($html);

        $this->assertIsArray($data);
        $this->assertArrayHasKey('title', $data);
        $this->assertArrayHasKey('description', $data);
        $this->assertArrayHasKey('full_article', $data);
        $this->assertArrayHasKey('thumbnail', $data);

        $this->assertSame('Guardian Test Article', $data['title']);
        $this->assertSame('Test description', $data['description']);
        $this->assertSame('Full article content here.', $data['full_article']);
        $this->assertSame('https://i.guim.co.uk/img/image.jpg', $data['thumbnail']);
    }

    public function test_extract_data_handles_missing_components_gracefully(): void
    {
        $html = '<html><body><div>Minimal content</div></body></html>';

        $data = GuardianArticlePageParser::extractData($html);

        $this->assertIsArray($data);
        $this->assertNull($data['title']);
        $this->assertNull($data['description']);
        $this->assertNull($data['full_article']);
        $this->assertNull($data['thumbnail']);
    }

    public function test_extract_full_article_with_realistic_guardian_html(): void
    {
        $html = '
            <html>
            <body>
                <div class="article-body-commercial-selector">
                    <p><strong>The prime minister has announced a new climate policy that aims to reduce carbon emissions by 50% by 2030.</strong></p>
                    <p>The announcement came during a press conference at Downing Street on Tuesday afternoon.</p>
                    <p>Environmental groups have cautiously welcomed the move, while industry leaders have expressed concern about the timeline.</p>
                </div>
            </body>
            </html>
        ';

        $fullArticle = GuardianArticlePageParser::extractFullArticle($html);

        $this->assertNotNull($fullArticle);
        $this->assertStringContainsString('climate policy', $fullArticle);
        $this->assertStringContainsString('press conference', $fullArticle);
        $this->assertStringContainsString('Environmental groups', $fullArticle);
        $this->assertStringContainsString("\n\n", $fullArticle);
        $this->assertStringNotContainsString('<strong>', $fullArticle);
    }
}
|
||||||
63
tests/Unit/Services/Parsers/GuardianArticleParserTest.php
Normal file
63
tests/Unit/Services/Parsers/GuardianArticleParserTest.php
Normal file
|
|
@ -0,0 +1,63 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Tests\Unit\Services\Parsers;
|
||||||
|
|
||||||
|
use App\Contracts\ArticleParserInterface;
|
||||||
|
use App\Services\Parsers\GuardianArticleParser;
|
||||||
|
use Tests\TestCase;
|
||||||
|
|
||||||
|
/**
 * Unit tests for GuardianArticleParser: interface conformance, URL
 * matching, source name and delegation to the page parser.
 */
class GuardianArticleParserTest extends TestCase
{
    /** Parser under test, recreated for every test. */
    private GuardianArticleParser $parser;

    protected function setUp(): void
    {
        parent::setUp();

        $this->parser = new GuardianArticleParser();
    }

    public function test_implements_article_parser_interface(): void
    {
        $this->assertInstanceOf(ArticleParserInterface::class, $this->parser);
    }

    public function test_can_parse_guardian_url(): void
    {
        $accepted = $this->parser->canParse('https://www.theguardian.com/world/2026/mar/08/some-article');

        $this->assertTrue($accepted);
    }

    public function test_can_parse_guardian_url_without_www(): void
    {
        $accepted = $this->parser->canParse('https://theguardian.com/world/2026/mar/08/some-article');

        $this->assertTrue($accepted);
    }

    public function test_cannot_parse_non_guardian_url(): void
    {
        $foreignUrls = [
            'https://www.vrt.be/vrtnws/en/article',
            'https://www.belganewsagency.eu/article',
        ];

        foreach ($foreignUrls as $foreignUrl) {
            $this->assertFalse($this->parser->canParse($foreignUrl));
        }
    }

    public function test_get_source_name(): void
    {
        $sourceName = $this->parser->getSourceName();

        $this->assertEquals('The Guardian', $sourceName);
    }

    public function test_extract_data_delegates_to_page_parser(): void
    {
        $html = '
            <html>
            <head>
                <meta property="og:title" content="Test Title"/>
                <meta property="og:description" content="Test Description"/>
            </head>
            <body><p>Content</p></body>
            </html>
        ';

        $extracted = $this->parser->extractData($html);

        $this->assertIsArray($extracted);
        $this->assertArrayHasKey('title', $extracted);
        $this->assertEquals('Test Title', $extracted['title']);
    }
}
|
||||||
Loading…
Reference in a new issue