78 - Convert Belga from website scraping to RSS feed parsing
This commit is contained in:
parent
19cbea9273
commit
0bb10729de
7 changed files with 56 additions and 96 deletions
|
|
@ -1,49 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace App\Services\Parsers;
|
||||
|
||||
/**
 * Extracts article links from the Belga News Agency homepage HTML.
 */
class BelgaHomepageParser
{
    /**
     * Site origin prepended to every relative article path.
     */
    private const BASE_URL = 'https://www.belganewsagency.eu';

    /**
     * Relative paths that point at site sections rather than articles.
     *
     * @var array<int, string>
     */
    private const NON_ARTICLE_PATHS = [
        '/',
        '/de',
        '/feed',
        '/search',
        '/category',
        '/about',
        '/contact',
        '/privacy',
        '/terms',
    ];

    /**
     * Collect the absolute URLs of all articles linked from the given HTML.
     *
     * Only anchors whose href is a single-segment relative path made of
     * lowercase letters, digits and dashes (e.g. "/some-article-slug") are
     * considered. Duplicates are removed, keeping first-occurrence order.
     *
     * @param  string  $html  Raw homepage markup.
     * @return array<int, string> Absolute article URLs, re-indexed from 0.
     */
    public static function extractArticleUrls(string $html): array
    {
        // Require whitespace right after "<a" and right before "href" so that
        // other tags starting with "a" (<article>, <abbr>, ...) and attributes
        // that merely end in "href" (data-href, ...) are not mistaken for links.
        $matchCount = preg_match_all(
            '/<a\s(?:[^>]*\s)?href="(\/[a-z0-9-]+)"/',
            $html,
            $matches,
        );

        // false signals a PCRE error, 0 means no candidate links at all.
        if ($matchCount === false || $matchCount === 0) {
            return [];
        }

        // The pattern only captures single-segment paths, so an exact,
        // strict comparison against the blacklist is sufficient.
        $articlePaths = array_filter(
            array_unique($matches[1]),
            static fn (string $path): bool => ! in_array($path, self::NON_ARTICLE_PATHS, true),
        );

        // Convert relative paths to absolute URLs and drop the sparse keys
        // left behind by array_unique()/array_filter().
        return array_values(array_map(
            static fn (string $path): string => self::BASE_URL.$path,
            $articlePaths,
        ));
    }
}
|
||||
|
|
@ -1,37 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace App\Services\Parsers;
|
||||
|
||||
use App\Contracts\HomepageParserInterface;
|
||||
|
||||
/**
 * Exposes the static BelgaHomepageParser through HomepageParserInterface.
 */
class BelgaHomepageParserAdapter implements HomepageParserInterface
{
    /** Substring a URL must contain for this adapter to accept it. */
    private const DOMAIN = 'belganewsagency.eu';

    /** Homepage URL reported by getHomepageUrl(). */
    private const HOMEPAGE_URL = 'https://www.belganewsagency.eu/';

    /** Source name reported by getSourceName(). */
    private const SOURCE_NAME = 'Belga News Agency';

    /**
     * @param  string  $language  Language code returned by getLanguage(); defaults to 'en'.
     */
    public function __construct(
        private readonly string $language = 'en',
    ) {}

    /**
     * Name of the news source this adapter represents.
     */
    public function getSourceName(): string
    {
        return self::SOURCE_NAME;
    }

    /**
     * URL of the source's homepage.
     */
    public function getHomepageUrl(): string
    {
        return self::HOMEPAGE_URL;
    }

    /**
     * Language code this adapter instance was constructed with.
     */
    public function getLanguage(): string
    {
        return $this->language;
    }

    /**
     * Whether the given URL contains the Belga domain (plain substring check).
     */
    public function canParse(string $url): bool
    {
        return str_contains($url, self::DOMAIN);
    }

    /**
     * Delegate article URL extraction to BelgaHomepageParser.
     */
    public function extractArticleUrls(string $html): array
    {
        return BelgaHomepageParser::extractArticleUrls($html);
    }
}
|
||||
|
|
@ -33,13 +33,12 @@
|
|||
'code' => 'belga',
|
||||
'name' => 'Belga News Agency',
|
||||
'description' => 'Belgian national news agency',
|
||||
'type' => 'website',
|
||||
'type' => 'rss',
|
||||
'is_active' => true,
|
||||
'languages' => [
|
||||
'en' => ['url' => 'https://www.belganewsagency.eu/'],
|
||||
'en' => ['url' => 'https://www.belganewsagency.eu/feed'],
|
||||
],
|
||||
'parsers' => [
|
||||
'homepage' => \App\Services\Parsers\BelgaHomepageParserAdapter::class,
|
||||
'article' => \App\Services\Parsers\BelgaArticleParser::class,
|
||||
'article_page' => \App\Services\Parsers\BelgaArticlePageParser::class,
|
||||
],
|
||||
|
|
|
|||
|
|
@ -75,7 +75,8 @@ public function belga(): static
|
|||
{
|
||||
return $this->state(fn (array $attributes) => [
|
||||
'provider' => 'belga',
|
||||
'url' => 'https://www.belganewsagency.eu/',
|
||||
'url' => 'https://www.belganewsagency.eu/feed',
|
||||
'type' => 'rss',
|
||||
]);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -98,16 +98,16 @@ public function test_store_creates_belga_feed_successfully(): void
|
|||
'message' => 'Feed created successfully!',
|
||||
'data' => [
|
||||
'name' => 'Belga Test Feed',
|
||||
'url' => 'https://www.belganewsagency.eu/',
|
||||
'type' => 'website',
|
||||
'url' => 'https://www.belganewsagency.eu/feed',
|
||||
'type' => 'rss',
|
||||
'is_active' => true,
|
||||
],
|
||||
]);
|
||||
|
||||
$this->assertDatabaseHas('feeds', [
|
||||
'name' => 'Belga Test Feed',
|
||||
'url' => 'https://www.belganewsagency.eu/',
|
||||
'type' => 'website',
|
||||
'url' => 'https://www.belganewsagency.eu/feed',
|
||||
'type' => 'rss',
|
||||
]);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -42,8 +42,8 @@ public function test_creates_belga_feed_with_correct_url(): void
|
|||
|
||||
$feed = $this->action->execute('Belga News', 'belga', $language->id);
|
||||
|
||||
$this->assertEquals('https://www.belganewsagency.eu/', $feed->url);
|
||||
$this->assertEquals('website', $feed->type);
|
||||
$this->assertEquals('https://www.belganewsagency.eu/feed', $feed->url);
|
||||
$this->assertEquals('rss', $feed->type);
|
||||
$this->assertEquals('belga', $feed->provider);
|
||||
$this->assertNull($feed->description);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -156,6 +156,52 @@ public function test_get_articles_from_rss_feed_handles_http_failure(): void
|
|||
$this->assertEmpty($result);
|
||||
}
|
||||
|
||||
/**
 * Fetching a Belga RSS feed containing two items returns two results and
 * stores both item links as articles attached to the feed.
 */
public function test_get_articles_from_belga_rss_feed_creates_articles(): void
{
    // Two-item RSS 2.0 document returned for every faked HTTP request.
    $rssPayload = <<<'XML'
    <?xml version="1.0" encoding="UTF-8"?>
    <rss version="2.0">
    <channel>
    <title>Belga News Agency</title>
    <link>https://www.belganewsagency.eu</link>
    <item>
    <title>Belgium announces new climate plan</title>
    <link>https://www.belganewsagency.eu/belgium-announces-new-climate-plan</link>
    <description>Belgium has unveiled a comprehensive climate strategy.</description>
    <pubDate>Sun, 08 Mar 2026 10:00:00 GMT</pubDate>
    </item>
    <item>
    <title>EU summit concludes in Brussels</title>
    <link>https://www.belganewsagency.eu/eu-summit-concludes-in-brussels</link>
    <description>European leaders reached agreement on key issues.</description>
    <pubDate>Sun, 08 Mar 2026 09:00:00 GMT</pubDate>
    </item>
    </channel>
    </rss>
    XML;

    Http::fake(['*' => Http::response($rssPayload, 200)]);

    $belgaFeed = Feed::factory()->create([
        'type' => 'rss',
        'provider' => 'belga',
        'url' => 'https://www.belganewsagency.eu/feed',
    ]);

    $fetched = $this->createArticleFetcher()->getArticlesFromFeed($belgaFeed);

    $this->assertCount(2, $fetched);

    // Each <link> from the feed must exist as an article row for this feed.
    $expectedUrls = [
        'https://www.belganewsagency.eu/belgium-announces-new-climate-plan',
        'https://www.belganewsagency.eu/eu-summit-concludes-in-brussels',
    ];

    foreach ($expectedUrls as $expectedUrl) {
        $this->assertDatabaseHas('articles', [
            'url' => $expectedUrl,
            'feed_id' => $belgaFeed->id,
        ]);
    }
}
|
||||
|
||||
protected function tearDown(): void
|
||||
{
|
||||
Mockery::close();
|
||||
|
|
|
|||
Loading…
Reference in a new issue