Decouple source and fetching

This commit is contained in:
myrmidex 2025-06-29 21:33:18 +02:00
parent 97edb507f6
commit 788d649276
14 changed files with 366 additions and 76 deletions

View file

@ -0,0 +1,27 @@
<?php
namespace App\Console\Commands;
use App\Models\Article;
use App\Services\Article\ArticleFetcher;
use Illuminate\Console\Command;
/**
 * Artisan command that fetches and prints the parsed data of a single article URL.
 *
 * Usage: php artisan article:fetch {url}
 */
class FetchArticleCommand extends Command
{
    protected $signature = 'article:fetch {url}';

    protected $description = 'Fetch article from url';

    /**
     * Create an Article for the given URL, fetch its data and dump the result.
     *
     * @return int self::SUCCESS when data was extracted, self::FAILURE when
     *             the fetcher returned an empty result.
     */
    public function handle(): int
    {
        // createQuietly is kept from the original: the record is created
        // through the "quiet" Eloquent variant rather than plain create().
        $article = Article::createQuietly([
            'url' => $this->argument('url'),
        ]);

        $res = ArticleFetcher::fetchArticleData($article);

        // An empty array is how the fetcher signals failure; surface it
        // through the exit code instead of reporting success.
        if ($res === []) {
            $this->error('Failed to fetch article data for: ' . $article->url);

            return self::FAILURE;
        }

        dump($res);

        return self::SUCCESS;
    }
}

View file

@ -21,7 +21,7 @@ public function handle(): int
$this->info('Publishing article: ' . $article->url);
try {
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticle($article));
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticleData($article));
} catch (Exception) {
return self::FAILURE;
}

View file

@ -0,0 +1,21 @@
<?php
namespace App\Contracts;
/**
 * Contract for a source-specific parser of single article pages.
 *
 * ArticleParserFactory instantiates the registered implementations and picks
 * the first one whose canParse() accepts the article URL.
 */
interface ArticleParserInterface
{
/**
 * Check if this parser can handle the given URL.
 *
 * @param string $url URL of the article page.
 * @return bool True when this parser should be used for the URL.
 */
public function canParse(string $url): bool;
/**
 * Extract article data from HTML.
 *
 * The VRT implementation returns the keys 'title', 'description' and
 * 'full_article'; other implementations presumably need to provide the same
 * keys, since the keyword validation reads 'full_article' (verify per parser).
 *
 * @param string $html Raw HTML of the article page.
 * @return array Extracted article data.
 */
public function extractData(string $html): array;
/**
 * Get the source name for this parser.
 *
 * @return string Human-readable source name (e.g. 'VRT News').
 */
public function getSourceName(): string;
}

View file

@ -0,0 +1,26 @@
<?php
namespace App\Contracts;
/**
 * Contract for a source-specific parser of a news homepage.
 *
 * HomepageParserFactory instantiates the registered implementations; the
 * article fetcher downloads each parser's homepage URL and passes the HTML to
 * extractArticleUrls().
 */
interface HomepageParserInterface
{
/**
 * Check if this parser can handle the given homepage URL.
 *
 * @param string $url URL of the homepage.
 * @return bool True when this parser should be used for the URL.
 */
public function canParse(string $url): bool;
/**
 * Extract article URLs from homepage HTML.
 *
 * @param string $html Raw HTML of the homepage.
 * @return array Article URLs found on the page (the VRT implementation
 *               returns absolute URL strings).
 */
public function extractArticleUrls(string $html): array;
/**
 * Get the homepage URL for this source.
 *
 * @return string URL that is fetched to discover new articles.
 */
public function getHomepageUrl(): string;
/**
 * Get the source name for this parser.
 *
 * @return string Human-readable source name (e.g. 'VRT News').
 */
public function getSourceName(): string;
}

View file

@ -18,6 +18,6 @@ public function handle(ArticleReadyToPublish $event): void
logger('Publishing article: ' . $article->id . ' : ' . $article->url);
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticle($article));
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticleData($article));
}
}

View file

@ -3,87 +3,49 @@
namespace App\Services\Article;
use App\Models\Article;
use App\Services\Http\HttpFetcher;
use App\Services\Factories\ArticleParserFactory;
use App\Services\Factories\HomepageParserFactory;
use Exception;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Http;
class ArticleFetcher
{
public static function getNewArticles(): Collection
{
return self::fetchArticles()
try {
$allArticles = collect();
foreach (HomepageParserFactory::getAllParsers() as $parser) {
$html = HttpFetcher::fetchHtml($parser->getHomepageUrl());
$urls = $parser->extractArticleUrls($html);
$articles = collect($urls)
->map(fn (string $url) => self::saveArticle($url));
$allArticles = $allArticles->merge($articles);
}
public static function fetchArticle(Article $article): array
return $allArticles->filter();
} catch (Exception $e) {
logger()->error("Failed to get new articles", ['error' => $e->getMessage()]);
return new Collection([]);
}
}
public static function fetchArticleData(Article $article): array
{
try {
$response = Http::get($article->url);
if (!$response->successful()) {
logger()->error('Failed to fetch article', [
'url' => $article->url,
'status' => $response->status()
]);
return [];
}
$html = $response->body();
return ArticleDataExtractor::extractData($html);
$html = HttpFetcher::fetchHtml($article->url);
$parser = ArticleParserFactory::getParser($article->url);
return $parser->extractData($html);
} catch (Exception $e) {
logger()->error('Exception while fetching article', [
logger()->error('Exception while fetching article data', [
'url' => $article->url,
'error' => $e->getMessage()
]);
return $article;
}
}
private static function fetchArticles(): Collection
{
try {
$response = Http::get('https://www.vrt.be/vrtnws/en/');
$html = $response->body();
// Extract article links using regex
preg_match_all('/href="(\/vrtnws\/en\/\d{4}\/\d{2}\/\d{2}\/[^"]+)"/', $html, $matches);
$urls = collect($matches[1] ?? [])
->unique()
->map(fn ($path) => 'https://www.vrt.be' . $path)
->toArray();
$responses = Http::pool(function ($pool) use ($urls) {
foreach ($urls as $url) {
$pool->get($url);
}
});
return collect($responses)
->map(function ($response, $index) use ($urls) {
if (!isset($urls[$index])) {
return null;
}
$url = $urls[$index];
try {
if ($response->successful()) {
return $url;
} else {
return null;
}
} catch (Exception) {
return null;
}
})
->filter(fn($article) => !empty($article));
} catch (Exception $e) {
logger()->error("Failed to fetch VRT homepage", ['error' => $e->getMessage()]);
return new Collection([]);
return [];
}
}

View file

@ -10,11 +10,29 @@ public static function validate(Article $article): Article
{
logger('Checking keywords for article: ' . $article->id);
$articleData = ArticleFetcher::fetchArticleData($article);
$validationResult = self::validateByKeywords($articleData['full_article']);
$article->update([
'is_valid' => true,
'is_valid' => $validationResult,
'validated_at' => now(),
]);
return $article->refresh();
}
/**
 * Decide whether an article body mentions at least one tracked keyword.
 *
 * Matching is case-insensitive. The parameter is nullable because the
 * article extractor may return null for 'full_article'; a missing or empty
 * body never matches.
 *
 * @param string|null $full_article Plain-text article body, or null when none was extracted.
 * @return bool True when any keyword occurs in the body.
 */
private static function validateByKeywords(?string $full_article): bool
{
    if ($full_article === null || $full_article === '') {
        return false;
    }

    $keywords = [
        'N-VA', 'Bart De Wever', 'Frank Vandenbroucke',
    ];

    foreach ($keywords as $keyword) {
        // stripos (not str_contains) because the match must ignore case.
        if (stripos($full_article, $keyword) !== false) {
            return true;
        }
    }

    return false;
}
}

View file

@ -0,0 +1,42 @@
<?php
namespace App\Services\Factories;
use App\Contracts\ArticleParserInterface;
use App\Services\Parsers\VrtArticleParser;
use Exception;
/**
 * Registry and factory for article page parsers.
 *
 * Parsers are stored as class names and instantiated on demand.
 */
class ArticleParserFactory
{
    /**
     * Registered parser class names, tried in order by getParser().
     *
     * @var array<int, class-string<ArticleParserInterface>>
     */
    private static array $parsers = [
        VrtArticleParser::class,
    ];

    /**
     * Return the first registered parser whose canParse() accepts the URL.
     *
     * @param string $url Article URL.
     * @throws Exception When no registered parser accepts the URL.
     */
    public static function getParser(string $url): ArticleParserInterface
    {
        foreach (self::$parsers as $parserClass) {
            $parser = new $parserClass();

            if ($parser->canParse($url)) {
                return $parser;
            }
        }

        throw new Exception("No parser found for URL: {$url}");
    }

    /**
     * List the source names of all registered parsers.
     *
     * @return array<int, string>
     */
    public static function getSupportedSources(): array
    {
        return array_map(
            static fn (string $parserClass): string => (new $parserClass())->getSourceName(),
            self::$parsers,
        );
    }

    /**
     * Register an additional parser class; registering the same class twice is a no-op.
     *
     * @param string $parserClass Fully-qualified name of a class implementing ArticleParserInterface.
     * @throws \InvalidArgumentException When the class does not exist or does not implement the interface.
     */
    public static function registerParser(string $parserClass): void
    {
        // Fail at registration time rather than with an unrelated error
        // the first time getParser() instantiates the class.
        if (!is_a($parserClass, ArticleParserInterface::class, true)) {
            throw new \InvalidArgumentException(
                "Parser class {$parserClass} must implement " . ArticleParserInterface::class
            );
        }

        if (!in_array($parserClass, self::$parsers, true)) {
            self::$parsers[] = $parserClass;
        }
    }
}

View file

@ -0,0 +1,47 @@
<?php
namespace App\Services\Factories;
use App\Contracts\HomepageParserInterface;
use App\Services\Parsers\VrtHomepageParserAdapter;
use Exception;
/**
 * Registry and factory for homepage parsers.
 *
 * Parsers are stored as class names and instantiated on demand.
 */
class HomepageParserFactory
{
    /**
     * Registered parser class names, tried in order by getParser().
     *
     * @var array<int, class-string<HomepageParserInterface>>
     */
    private static array $parsers = [
        VrtHomepageParserAdapter::class,
    ];

    /**
     * Return the first registered parser whose canParse() accepts the URL.
     *
     * @param string $url Homepage URL.
     * @throws Exception When no registered parser accepts the URL.
     */
    public static function getParser(string $url): HomepageParserInterface
    {
        foreach (self::$parsers as $parserClass) {
            $parser = new $parserClass();

            if ($parser->canParse($url)) {
                return $parser;
            }
        }

        throw new Exception("No homepage parser found for URL: {$url}");
    }

    /**
     * Instantiate every registered parser.
     *
     * @return array<int, HomepageParserInterface>
     */
    public static function getAllParsers(): array
    {
        return array_map(
            static fn (string $parserClass): HomepageParserInterface => new $parserClass(),
            self::$parsers,
        );
    }

    /**
     * List the source names of all registered parsers.
     *
     * @return array<int, string>
     */
    public static function getSupportedSources(): array
    {
        return array_map(
            static fn (string $parserClass): string => (new $parserClass())->getSourceName(),
            self::$parsers,
        );
    }

    /**
     * Register an additional parser class; registering the same class twice is a no-op.
     *
     * @param string $parserClass Fully-qualified name of a class implementing HomepageParserInterface.
     * @throws \InvalidArgumentException When the class does not exist or does not implement the interface.
     */
    public static function registerParser(string $parserClass): void
    {
        // Fail at registration time rather than with an unrelated error
        // the first time the class is instantiated and used.
        if (!is_a($parserClass, HomepageParserInterface::class, true)) {
            throw new \InvalidArgumentException(
                "Parser class {$parserClass} must implement " . HomepageParserInterface::class
            );
        }

        if (!in_array($parserClass, self::$parsers, true)) {
            self::$parsers[] = $parserClass;
        }
    }
}

View file

@ -0,0 +1,77 @@
<?php
namespace App\Services\Http;
use Illuminate\Support\Facades\Http;
use Exception;
/**
 * Thin wrapper around the Laravel HTTP client for downloading HTML.
 */
class HttpFetcher
{
    /**
     * Download a single URL and return its response body.
     *
     * Any failure is logged and then rethrown to the caller.
     *
     * @param string $url URL to fetch.
     * @return string Response body.
     * @throws Exception When the request fails or the status is not successful.
     */
    public static function fetchHtml(string $url): string
    {
        try {
            $response = Http::get($url);

            if (!$response->successful()) {
                throw new Exception("Failed to fetch URL: {$url} - Status: {$response->status()}");
            }

            return $response->body();
        } catch (Exception $e) {
            logger()->error('HTTP fetch failed', [
                'url' => $url,
                'error' => $e->getMessage(),
            ]);

            throw $e;
        }
    }

    /**
     * Download several URLs concurrently.
     *
     * Never throws for request failures: each URL gets a result entry, and a
     * failure of the pool as a whole is logged and yields an empty array.
     *
     * @param array $urls URLs to fetch; array keys are ignored.
     * @return array<int, array{url: mixed, html: ?string, success: bool, status?: int, error?: string}>
     *         One entry per URL, in request order.
     */
    public static function fetchMultipleUrls(array $urls): array
    {
        // Responses are paired with URLs by position, so the input must be
        // a 0-based list even if the caller passed a sparse or keyed array.
        $urls = array_values($urls);

        if ($urls === []) {
            return [];
        }

        try {
            $responses = Http::pool(function ($pool) use ($urls) {
                foreach ($urls as $url) {
                    $pool->get($url);
                }
            });

            $results = [];

            foreach ($responses as $index => $response) {
                if (!isset($urls[$index])) {
                    continue;
                }

                $results[$index] = self::describePoolResult($urls[$index], $response);
            }

            return $results;
        } catch (Exception $e) {
            logger()->error('Multiple URL fetch failed', ['error' => $e->getMessage()]);

            return [];
        }
    }

    /**
     * Convert one pool entry into the result array returned by fetchMultipleUrls().
     *
     * @param mixed $url      URL the entry belongs to.
     * @param mixed $response Pool entry: a client Response, or an exception object for a failed request.
     * @return array{url: mixed, html: ?string, success: bool, status?: int, error?: string}
     */
    private static function describePoolResult($url, mixed $response): array
    {
        // A failed request can be represented by an exception object in the
        // pool results; calling Response methods on it would raise an Error
        // that the Exception handlers here do not catch.
        if (!$response instanceof \Illuminate\Http\Client\Response) {
            return [
                'url' => $url,
                'html' => null,
                'success' => false,
                'error' => $response instanceof \Throwable
                    ? $response->getMessage()
                    : 'Exception occurred',
            ];
        }

        try {
            if ($response->successful()) {
                return [
                    'url' => $url,
                    'html' => $response->body(),
                    'success' => true,
                ];
            }

            return [
                'url' => $url,
                'html' => null,
                'success' => false,
                'status' => $response->status(),
            ];
        } catch (Exception) {
            return [
                'url' => $url,
                'html' => null,
                'success' => false,
                'error' => 'Exception occurred',
            ];
        }
    }
}

View file

@ -1,8 +1,8 @@
<?php
namespace App\Services\Article;
namespace App\Services\Parsers;
class ArticleDataExtractor
class VrtArticlePageParser
{
public static function extractTitle(string $html): ?string
{
@ -39,15 +39,6 @@ public static function extractDescription(string $html): ?string
return null;
}
public static function extractData(string $html): array
{
return [
'title' => self::extractTitle($html),
'description' => self::extractDescription($html),
'full_article' => self::extractFullArticle($html),
];
}
public static function extractFullArticle(string $html): ?string
{
// Remove scripts, styles, and other non-content elements
@ -72,4 +63,13 @@ public static function extractFullArticle(string $html): ?string
return null;
}
public static function extractData(string $html): array
{
return [
'title' => self::extractTitle($html),
'description' => self::extractDescription($html),
'full_article' => self::extractFullArticle($html),
];
}
}

View file

@ -0,0 +1,23 @@
<?php
namespace App\Services\Parsers;
use App\Contracts\ArticleParserInterface;
/**
 * Article parser for VRT News pages; delegates extraction to VrtArticlePageParser.
 */
class VrtArticleParser implements ArticleParserInterface
{
    /**
     * Accept URLs whose host is vrt.be or one of its subdomains.
     *
     * The host is compared rather than searching the whole URL, so that
     * "vrt.be" appearing in a path, query string or as the tail of another
     * domain (e.g. "notvrt.be") does not select this parser.
     */
    public function canParse(string $url): bool
    {
        $host = parse_url($url, PHP_URL_HOST);

        if (!is_string($host) || $host === '') {
            // Scheme-less input such as "www.vrt.be/..." is parsed as a
            // path; retry as a network-path reference to recover the host.
            $host = parse_url('//' . ltrim($url, '/'), PHP_URL_HOST);
        }

        if (!is_string($host) || $host === '') {
            return false;
        }

        $host = strtolower($host);

        return $host === 'vrt.be' || str_ends_with($host, '.vrt.be');
    }

    /**
     * Extract article data from VRT article HTML.
     *
     * @param string $html Raw HTML of the article page.
     * @return array Result of VrtArticlePageParser::extractData().
     */
    public function extractData(string $html): array
    {
        return VrtArticlePageParser::extractData($html);
    }

    /**
     * Human-readable name of this source.
     */
    public function getSourceName(): string
    {
        return 'VRT News';
    }
}

View file

@ -0,0 +1,19 @@
<?php
namespace App\Services\Parsers;
/**
 * Extracts article links from the VRT News (English) homepage HTML.
 */
class VrtHomepageParser
{
    /**
     * Collect the distinct dated article links found in the HTML.
     *
     * Only hrefs of the form /vrtnws/en/YYYY/MM/DD/... are considered. Each
     * is returned once, in order of first appearance, as an absolute URL.
     *
     * @param string $html Raw homepage HTML.
     * @return array<int, string> Sequentially indexed list of absolute article URLs.
     */
    public static function extractArticleUrls(string $html): array
    {
        // Extract article links using regex
        preg_match_all('/href="(\/vrtnws\/en\/\d{4}\/\d{2}\/\d{2}\/[^"]+)"/', $html, $matches);

        // array_unique keeps the original keys, so re-index to hand callers
        // a gap-free list.
        $paths = array_values(array_unique($matches[1] ?? []));

        return array_map(
            static fn (string $path): string => 'https://www.vrt.be' . $path,
            $paths,
        );
    }
}

View file

@ -0,0 +1,28 @@
<?php
namespace App\Services\Parsers;
use App\Contracts\HomepageParserInterface;
/**
 * Adapts the static VrtHomepageParser to HomepageParserInterface.
 */
class VrtHomepageParserAdapter implements HomepageParserInterface
{
    /**
     * Accept URLs whose host is vrt.be or one of its subdomains.
     *
     * The host is compared rather than searching the whole URL, so that
     * "vrt.be" appearing in a path, query string or as the tail of another
     * domain (e.g. "notvrt.be") does not select this parser.
     */
    public function canParse(string $url): bool
    {
        $host = parse_url($url, PHP_URL_HOST);

        if (!is_string($host) || $host === '') {
            // Scheme-less input such as "www.vrt.be/..." is parsed as a
            // path; retry as a network-path reference to recover the host.
            $host = parse_url('//' . ltrim($url, '/'), PHP_URL_HOST);
        }

        if (!is_string($host) || $host === '') {
            return false;
        }

        $host = strtolower($host);

        return $host === 'vrt.be' || str_ends_with($host, '.vrt.be');
    }

    /**
     * Extract article URLs from VRT homepage HTML.
     *
     * @param string $html Raw homepage HTML.
     * @return array Result of VrtHomepageParser::extractArticleUrls().
     */
    public function extractArticleUrls(string $html): array
    {
        return VrtHomepageParser::extractArticleUrls($html);
    }

    /**
     * URL of the VRT News (English) homepage.
     */
    public function getHomepageUrl(): string
    {
        return 'https://www.vrt.be/vrtnws/en/';
    }

    /**
     * Human-readable name of this source.
     */
    public function getSourceName(): string
    {
        return 'VRT News';
    }
}