From 0bb10729deec19c792721eedc0e8cf8f590a3662 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 8 Mar 2026 17:19:11 +0100 Subject: [PATCH] 78 - Convert Belga from website scraping to RSS feed parsing --- app/Services/Parsers/BelgaHomepageParser.php | 49 ------------------- .../Parsers/BelgaHomepageParserAdapter.php | 37 -------------- config/feed.php | 5 +- database/factories/FeedFactory.php | 3 +- .../Api/V1/FeedsControllerTest.php | 8 +-- tests/Unit/Actions/CreateFeedActionTest.php | 4 +- tests/Unit/Services/ArticleFetcherRssTest.php | 46 +++++++++++++++++ 7 files changed, 56 insertions(+), 96 deletions(-) delete mode 100644 app/Services/Parsers/BelgaHomepageParser.php delete mode 100644 app/Services/Parsers/BelgaHomepageParserAdapter.php diff --git a/app/Services/Parsers/BelgaHomepageParser.php b/app/Services/Parsers/BelgaHomepageParser.php deleted file mode 100644 index 55dc131..0000000 --- a/app/Services/Parsers/BelgaHomepageParser.php +++ /dev/null @@ -1,49 +0,0 @@ - - */ - public static function extractArticleUrls(string $html): array - { - // Find all relative article links (most articles use relative paths) - preg_match_all('/]+href="(\/[a-z0-9-]+)"/', $html, $matches); - - // Blacklist of non-article paths - $blacklistPaths = [ - '/', - '/de', - '/feed', - '/search', - '/category', - '/about', - '/contact', - '/privacy', - '/terms', - ]; - - $urls = collect($matches[1]) - ->unique() - ->filter(function ($path) use ($blacklistPaths) { - // Exclude exact matches and paths starting with blacklisted paths - foreach ($blacklistPaths as $blacklistedPath) { - if ($path === $blacklistedPath || str_starts_with($path, $blacklistedPath.'/')) { - return false; - } - } - - return true; - }) - ->map(function ($path) { - // Convert relative paths to absolute URLs - return 'https://www.belganewsagency.eu'.$path; - }) - ->values() - ->toArray(); - - return $urls; - } -} diff --git a/app/Services/Parsers/BelgaHomepageParserAdapter.php b/app/Services/Parsers/BelgaHomepageParserAdapter.php deleted file mode 100644 index 0bc9bb6..0000000 --- a/app/Services/Parsers/BelgaHomepageParserAdapter.php +++ /dev/null @@ -1,37 +0,0 @@ -language; - } - - public function canParse(string $url): bool - { - return str_contains($url, 'belganewsagency.eu'); - } - - public function extractArticleUrls(string $html): array - { - return BelgaHomepageParser::extractArticleUrls($html); - } - - public function getHomepageUrl(): string - { - return 'https://www.belganewsagency.eu/'; - } - - public function getSourceName(): string - { - return 'Belga News Agency'; - } -} diff --git a/config/feed.php b/config/feed.php index c6cc819..73227fe 100644 --- a/config/feed.php +++ b/config/feed.php @@ -33,13 +33,12 @@ 'code' => 'belga', 'name' => 'Belga News Agency', 'description' => 'Belgian national news agency', - 'type' => 'website', + 'type' => 'rss', 'is_active' => true, 'languages' => [ - 'en' => ['url' => 'https://www.belganewsagency.eu/'], + 'en' => ['url' => 'https://www.belganewsagency.eu/feed'], ], 'parsers' => [ - 'homepage' => \App\Services\Parsers\BelgaHomepageParserAdapter::class, 'article' => \App\Services\Parsers\BelgaArticleParser::class, 'article_page' => \App\Services\Parsers\BelgaArticlePageParser::class, ], diff --git a/database/factories/FeedFactory.php b/database/factories/FeedFactory.php index 06c903c..fc0cb80 100644 --- a/database/factories/FeedFactory.php +++ b/database/factories/FeedFactory.php @@ -75,7 +75,8 @@ public function belga(): static { return $this->state(fn (array $attributes) => [ 'provider' => 'belga', - 'url' => 'https://www.belganewsagency.eu/', + 'url' => 'https://www.belganewsagency.eu/feed', + 'type' => 'rss', ]); } } diff --git a/tests/Feature/Http/Controllers/Api/V1/FeedsControllerTest.php b/tests/Feature/Http/Controllers/Api/V1/FeedsControllerTest.php index d62d502..0acac2b 100644 --- a/tests/Feature/Http/Controllers/Api/V1/FeedsControllerTest.php +++ b/tests/Feature/Http/Controllers/Api/V1/FeedsControllerTest.php @@ -98,16 +98,16 @@ public function test_store_creates_belga_feed_successfully(): void 'message' => 'Feed created successfully!', 'data' => [ 'name' => 'Belga Test Feed', - 'url' => 'https://www.belganewsagency.eu/', - 'type' => 'website', + 'url' => 'https://www.belganewsagency.eu/feed', + 'type' => 'rss', 'is_active' => true, ], ]); $this->assertDatabaseHas('feeds', [ 'name' => 'Belga Test Feed', - 'url' => 'https://www.belganewsagency.eu/', - 'type' => 'website', + 'url' => 'https://www.belganewsagency.eu/feed', + 'type' => 'rss', ]); } diff --git a/tests/Unit/Actions/CreateFeedActionTest.php b/tests/Unit/Actions/CreateFeedActionTest.php index 3cc6013..accb3f5 100644 --- a/tests/Unit/Actions/CreateFeedActionTest.php +++ b/tests/Unit/Actions/CreateFeedActionTest.php @@ -42,8 +42,8 @@ public function test_creates_belga_feed_with_correct_url(): void $feed = $this->action->execute('Belga News', 'belga', $language->id); - $this->assertEquals('https://www.belganewsagency.eu/', $feed->url); - $this->assertEquals('website', $feed->type); + $this->assertEquals('https://www.belganewsagency.eu/feed', $feed->url); + $this->assertEquals('rss', $feed->type); $this->assertEquals('belga', $feed->provider); $this->assertNull($feed->description); } diff --git a/tests/Unit/Services/ArticleFetcherRssTest.php b/tests/Unit/Services/ArticleFetcherRssTest.php index 3efab83..425ae23 100644 --- a/tests/Unit/Services/ArticleFetcherRssTest.php +++ b/tests/Unit/Services/ArticleFetcherRssTest.php @@ -156,6 +156,52 @@ public function test_get_articles_from_rss_feed_handles_http_failure(): void $this->assertEmpty($result); } + public function test_get_articles_from_belga_rss_feed_creates_articles(): void + { + $belgaRss = <<<'XML' + + + + Belga News Agency + https://www.belganewsagency.eu + + Belgium announces new climate plan + https://www.belganewsagency.eu/belgium-announces-new-climate-plan + Belgium has unveiled a comprehensive climate strategy. + Sun, 08 Mar 2026 10:00:00 GMT + + + EU summit concludes in Brussels + https://www.belganewsagency.eu/eu-summit-concludes-in-brussels + European leaders reached agreement on key issues. + Sun, 08 Mar 2026 09:00:00 GMT + + + +XML; + + Http::fake(['*' => Http::response($belgaRss, 200)]); + + $feed = Feed::factory()->create([ + 'type' => 'rss', + 'provider' => 'belga', + 'url' => 'https://www.belganewsagency.eu/feed', + ]); + + $fetcher = $this->createArticleFetcher(); + $result = $fetcher->getArticlesFromFeed($feed); + + $this->assertCount(2, $result); + $this->assertDatabaseHas('articles', [ + 'url' => 'https://www.belganewsagency.eu/belgium-announces-new-climate-plan', + 'feed_id' => $feed->id, + ]); + $this->assertDatabaseHas('articles', [ + 'url' => 'https://www.belganewsagency.eu/eu-summit-concludes-in-brussels', + 'feed_id' => $feed->id, + ]); + } + protected function tearDown(): void { Mockery::close();