Release v1.2.0 #87
7 changed files with 56 additions and 96 deletions
|
|
@ -1,49 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
namespace App\Services\Parsers;
|
|
||||||
|
|
||||||
/**
 * Extracts article URLs from the HTML of the Belga News Agency homepage.
 */
class BelgaHomepageParser
{
    /**
     * Origin prepended to relative article paths when the caller supplies none.
     */
    private const DEFAULT_BASE_URL = 'https://www.belganewsagency.eu';

    /**
     * Single-segment paths that are never treated as articles.
     *
     * @var array<int, string>
     */
    private const BLACKLIST_PATHS = [
        '/',
        '/de',
        '/feed',
        '/search',
        '/category',
        '/about',
        '/contact',
        '/privacy',
        '/terms',
    ];

    /**
     * Collect the absolute URLs of the articles linked from the given HTML.
     *
     * Only anchors whose double-quoted href is a relative, single-segment,
     * lowercase slug (e.g. "/some-article-title") are considered. Duplicates
     * are dropped (first occurrence wins) and blacklisted paths are skipped.
     *
     * @param  string  $html  Markup to scan for anchors.
     * @param  string  $baseUrl  Origin prepended to each path; a trailing slash is ignored.
     * @return array<int, string> Absolute URLs in document order, re-indexed from 0.
     */
    public static function extractArticleUrls(string $html, string $baseUrl = self::DEFAULT_BASE_URL): array
    {
        // Require whitespace directly after "<a" and directly before "href" so that
        // tags like <area>/<abbr> and attributes like data-href are not mistaken for links.
        $found = preg_match_all('/<a\s(?:[^>]*\s)?href="(\/[a-z0-9-]+)"/', $html, $matches);

        // false signals a PCRE error (e.g. backtrack limit); 0 means no links at all.
        if ($found === false || $found === 0) {
            return [];
        }

        $origin = rtrim($baseUrl, '/');

        // The pattern only captures single-segment paths, so an exact comparison against
        // the blacklist is sufficient; a "starts with <entry>/" check could never match.
        $articlePaths = array_filter(
            array_unique($matches[1]),
            static fn (string $path): bool => ! in_array($path, self::BLACKLIST_PATHS, true),
        );

        // array_filter/array_unique preserve keys; re-index so callers get a plain list.
        return array_values(array_map(
            static fn (string $path): string => $origin.$path,
            $articlePaths,
        ));
    }
}
|
|
||||||
|
|
@ -1,37 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
namespace App\Services\Parsers;
|
|
||||||
|
|
||||||
use App\Contracts\HomepageParserInterface;
|
|
||||||
|
|
||||||
/**
 * Exposes the static BelgaHomepageParser through the HomepageParserInterface contract.
 */
class BelgaHomepageParserAdapter implements HomepageParserInterface
{
    /**
     * Domain this adapter is responsible for; its subdomains (e.g. "www.") match as well.
     */
    private const HOST = 'belganewsagency.eu';

    /**
     * @param  string  $language  Value reported by getLanguage(); defaults to 'en'.
     */
    public function __construct(
        private readonly string $language = 'en',
    ) {}

    /**
     * Return the language this adapter instance was constructed with.
     */
    public function getLanguage(): string
    {
        return $this->language;
    }

    /**
     * Tell whether the given URL points at the Belga site.
     *
     * The decision is made on the URL's host rather than on a substring of the
     * whole URL, so a foreign URL that merely mentions the domain in its path
     * or query string (or a look-alike such as "notbelganewsagency.eu") is rejected.
     *
     * @param  string  $url  Absolute URL, or a scheme-less "host/path" string.
     */
    public function canParse(string $url): bool
    {
        $host = parse_url($url, PHP_URL_HOST);

        if (! is_string($host)) {
            // Scheme-less input ("www.belganewsagency.eu/feed") has no host component;
            // re-parse it as a network-path reference so the leading segment is the host.
            $host = parse_url('//'.ltrim($url, '/'), PHP_URL_HOST);
        }

        if (! is_string($host)) {
            return false;
        }

        // Host names are case-insensitive.
        $host = strtolower($host);

        return $host === self::HOST || str_ends_with($host, '.'.self::HOST);
    }

    /**
     * Delegate URL extraction to BelgaHomepageParser.
     *
     * @return array<int, string>
     */
    public function extractArticleUrls(string $html): array
    {
        return BelgaHomepageParser::extractArticleUrls($html);
    }

    /**
     * Return the URL of the Belga homepage.
     */
    public function getHomepageUrl(): string
    {
        return 'https://www.belganewsagency.eu/';
    }

    /**
     * Return the display name of this source.
     */
    public function getSourceName(): string
    {
        return 'Belga News Agency';
    }
}
|
|
||||||
|
|
@ -33,13 +33,12 @@
|
||||||
'code' => 'belga',
|
'code' => 'belga',
|
||||||
'name' => 'Belga News Agency',
|
'name' => 'Belga News Agency',
|
||||||
'description' => 'Belgian national news agency',
|
'description' => 'Belgian national news agency',
|
||||||
'type' => 'website',
|
'type' => 'rss',
|
||||||
'is_active' => true,
|
'is_active' => true,
|
||||||
'languages' => [
|
'languages' => [
|
||||||
'en' => ['url' => 'https://www.belganewsagency.eu/'],
|
'en' => ['url' => 'https://www.belganewsagency.eu/feed'],
|
||||||
],
|
],
|
||||||
'parsers' => [
|
'parsers' => [
|
||||||
'homepage' => \App\Services\Parsers\BelgaHomepageParserAdapter::class,
|
|
||||||
'article' => \App\Services\Parsers\BelgaArticleParser::class,
|
'article' => \App\Services\Parsers\BelgaArticleParser::class,
|
||||||
'article_page' => \App\Services\Parsers\BelgaArticlePageParser::class,
|
'article_page' => \App\Services\Parsers\BelgaArticlePageParser::class,
|
||||||
],
|
],
|
||||||
|
|
|
||||||
|
|
@ -75,7 +75,8 @@ public function belga(): static
|
||||||
{
|
{
|
||||||
return $this->state(fn (array $attributes) => [
|
return $this->state(fn (array $attributes) => [
|
||||||
'provider' => 'belga',
|
'provider' => 'belga',
|
||||||
'url' => 'https://www.belganewsagency.eu/',
|
'url' => 'https://www.belganewsagency.eu/feed',
|
||||||
|
'type' => 'rss',
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -98,16 +98,16 @@ public function test_store_creates_belga_feed_successfully(): void
|
||||||
'message' => 'Feed created successfully!',
|
'message' => 'Feed created successfully!',
|
||||||
'data' => [
|
'data' => [
|
||||||
'name' => 'Belga Test Feed',
|
'name' => 'Belga Test Feed',
|
||||||
'url' => 'https://www.belganewsagency.eu/',
|
'url' => 'https://www.belganewsagency.eu/feed',
|
||||||
'type' => 'website',
|
'type' => 'rss',
|
||||||
'is_active' => true,
|
'is_active' => true,
|
||||||
],
|
],
|
||||||
]);
|
]);
|
||||||
|
|
||||||
$this->assertDatabaseHas('feeds', [
|
$this->assertDatabaseHas('feeds', [
|
||||||
'name' => 'Belga Test Feed',
|
'name' => 'Belga Test Feed',
|
||||||
'url' => 'https://www.belganewsagency.eu/',
|
'url' => 'https://www.belganewsagency.eu/feed',
|
||||||
'type' => 'website',
|
'type' => 'rss',
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -42,8 +42,8 @@ public function test_creates_belga_feed_with_correct_url(): void
|
||||||
|
|
||||||
$feed = $this->action->execute('Belga News', 'belga', $language->id);
|
$feed = $this->action->execute('Belga News', 'belga', $language->id);
|
||||||
|
|
||||||
$this->assertEquals('https://www.belganewsagency.eu/', $feed->url);
|
$this->assertEquals('https://www.belganewsagency.eu/feed', $feed->url);
|
||||||
$this->assertEquals('website', $feed->type);
|
$this->assertEquals('rss', $feed->type);
|
||||||
$this->assertEquals('belga', $feed->provider);
|
$this->assertEquals('belga', $feed->provider);
|
||||||
$this->assertNull($feed->description);
|
$this->assertNull($feed->description);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -156,6 +156,52 @@ public function test_get_articles_from_rss_feed_handles_http_failure(): void
|
||||||
$this->assertEmpty($result);
|
$this->assertEmpty($result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Fetching an RSS-type Belga feed returns, and stores, one article per <item> in the feed XML.
 */
public function test_get_articles_from_belga_rss_feed_creates_articles(): void
{
    // Canned two-item RSS document returned for every outgoing HTTP request.
    $rssPayload = <<<'XML'
    <?xml version="1.0" encoding="UTF-8"?>
    <rss version="2.0">
    <channel>
    <title>Belga News Agency</title>
    <link>https://www.belganewsagency.eu</link>
    <item>
    <title>Belgium announces new climate plan</title>
    <link>https://www.belganewsagency.eu/belgium-announces-new-climate-plan</link>
    <description>Belgium has unveiled a comprehensive climate strategy.</description>
    <pubDate>Sun, 08 Mar 2026 10:00:00 GMT</pubDate>
    </item>
    <item>
    <title>EU summit concludes in Brussels</title>
    <link>https://www.belganewsagency.eu/eu-summit-concludes-in-brussels</link>
    <description>European leaders reached agreement on key issues.</description>
    <pubDate>Sun, 08 Mar 2026 09:00:00 GMT</pubDate>
    </item>
    </channel>
    </rss>
    XML;

    Http::fake(['*' => Http::response($rssPayload, 200)]);

    $belgaFeed = Feed::factory()->create([
        'type' => 'rss',
        'provider' => 'belga',
        'url' => 'https://www.belganewsagency.eu/feed',
    ]);

    $articles = $this->createArticleFetcher()->getArticlesFromFeed($belgaFeed);

    $this->assertCount(2, $articles);

    // Each <item> link must have been persisted against this feed.
    $expectedUrls = [
        'https://www.belganewsagency.eu/belgium-announces-new-climate-plan',
        'https://www.belganewsagency.eu/eu-summit-concludes-in-brussels',
    ];

    foreach ($expectedUrls as $expectedUrl) {
        $this->assertDatabaseHas('articles', [
            'url' => $expectedUrl,
            'feed_id' => $belgaFeed->id,
        ]);
    }
}
|
||||||
|
|
||||||
protected function tearDown(): void
|
protected function tearDown(): void
|
||||||
{
|
{
|
||||||
Mockery::close();
|
Mockery::close();
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue