Decouple source and fetching
This commit is contained in:
parent
97edb507f6
commit
788d649276
14 changed files with 366 additions and 76 deletions
27
app/Console/Commands/FetchArticleCommand.php
Normal file
27
app/Console/Commands/FetchArticleCommand.php
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
<?php
|
||||
|
||||
namespace App\Console\Commands;
|
||||
|
||||
use App\Models\Article;
|
||||
use App\Services\Article\ArticleFetcher;
|
||||
use Illuminate\Console\Command;
|
||||
|
||||
/**
 * Artisan command that stores an article record for a URL and prints the
 * data extracted from that article.
 *
 * Usage: php artisan article:fetch {url}
 */
class FetchArticleCommand extends Command
{
    protected $signature = 'article:fetch {url}';

    protected $description = 'Fetch article from url';

    /**
     * Create the article, fetch its data and dump the result to the console.
     *
     * @return int self::SUCCESS when data was extracted, self::FAILURE when
     *             the fetcher came back empty-handed.
     */
    public function handle(): int
    {
        // NOTE(review): createQuietly() is Eloquent's "create without dispatching
        // model events" variant — confirm that skipping events is intended here.
        $article = Article::createQuietly([
            'url' => $this->argument('url'),
        ]);

        $data = ArticleFetcher::fetchArticleData($article);

        // The fetcher logs and returns an empty array when fetching or parsing
        // throws, so an empty result must not be reported as a success.
        if ($data === []) {
            $this->error('Could not fetch article data for: ' . $article->url);

            return self::FAILURE;
        }

        dump($data);

        return self::SUCCESS;
    }
}
|
||||
|
|
@ -21,7 +21,7 @@ public function handle(): int
|
|||
$this->info('Publishing article: ' . $article->url);
|
||||
|
||||
try {
|
||||
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticle($article));
|
||||
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticleData($article));
|
||||
} catch (Exception) {
|
||||
return self::FAILURE;
|
||||
}
|
||||
|
|
|
|||
21
app/Contracts/ArticleParserInterface.php
Normal file
21
app/Contracts/ArticleParserInterface.php
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
<?php
|
||||
|
||||
namespace App\Contracts;
|
||||
|
||||
/**
 * Contract for a parser that turns the HTML of one news source's article
 * pages into structured data.
 */
interface ArticleParserInterface
{
    /**
     * Tell whether this parser is able to handle the given URL.
     */
    public function canParse(string $url): bool;

    /**
     * Extract article data from an HTML document.
     *
     * @return array<string, mixed> NOTE(review): the VRT implementation returns the
     *     keys "title", "description" and "full_article" — confirm whether every
     *     parser is expected to return the same keys.
     */
    public function extractData(string $html): array;

    /**
     * Return the name of the source this parser belongs to.
     */
    public function getSourceName(): string;
}
|
||||
26
app/Contracts/HomepageParserInterface.php
Normal file
26
app/Contracts/HomepageParserInterface.php
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
<?php
|
||||
|
||||
namespace App\Contracts;
|
||||
|
||||
/**
 * Contract for a parser that discovers article links on the homepage of one
 * news source.
 */
interface HomepageParserInterface
{
    /**
     * Tell whether this parser is able to handle the given homepage URL.
     */
    public function canParse(string $url): bool;

    /**
     * Extract the article URLs found in the homepage HTML.
     *
     * @return array NOTE(review): the VRT implementation returns absolute URL
     *     strings — confirm that every parser must do the same.
     */
    public function extractArticleUrls(string $html): array;

    /**
     * Return the homepage URL of this source.
     */
    public function getHomepageUrl(): string;

    /**
     * Return the name of the source this parser belongs to.
     */
    public function getSourceName(): string;
}
|
||||
|
|
@ -18,6 +18,6 @@ public function handle(ArticleReadyToPublish $event): void
|
|||
|
||||
logger('Publishing article: ' . $article->id . ' : ' . $article->url);
|
||||
|
||||
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticle($article));
|
||||
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticleData($article));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,87 +3,49 @@
|
|||
namespace App\Services\Article;
|
||||
|
||||
use App\Models\Article;
|
||||
use App\Services\Http\HttpFetcher;
|
||||
use App\Services\Factories\ArticleParserFactory;
|
||||
use App\Services\Factories\HomepageParserFactory;
|
||||
use Exception;
|
||||
use Illuminate\Support\Collection;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
|
||||
class ArticleFetcher
|
||||
{
|
||||
public static function getNewArticles(): Collection
|
||||
{
|
||||
return self::fetchArticles()
|
||||
->map(fn (string $url) => self::saveArticle($url));
|
||||
}
|
||||
|
||||
public static function fetchArticle(Article $article): array
|
||||
{
|
||||
try {
|
||||
$response = Http::get($article->url);
|
||||
$allArticles = collect();
|
||||
|
||||
if (!$response->successful()) {
|
||||
logger()->error('Failed to fetch article', [
|
||||
'url' => $article->url,
|
||||
'status' => $response->status()
|
||||
]);
|
||||
return [];
|
||||
foreach (HomepageParserFactory::getAllParsers() as $parser) {
|
||||
$html = HttpFetcher::fetchHtml($parser->getHomepageUrl());
|
||||
$urls = $parser->extractArticleUrls($html);
|
||||
|
||||
$articles = collect($urls)
|
||||
->map(fn (string $url) => self::saveArticle($url));
|
||||
|
||||
$allArticles = $allArticles->merge($articles);
|
||||
}
|
||||
|
||||
$html = $response->body();
|
||||
|
||||
return ArticleDataExtractor::extractData($html);
|
||||
|
||||
return $allArticles->filter();
|
||||
} catch (Exception $e) {
|
||||
logger()->error('Exception while fetching article', [
|
||||
'url' => $article->url,
|
||||
'error' => $e->getMessage()
|
||||
]);
|
||||
return $article;
|
||||
logger()->error("Failed to get new articles", ['error' => $e->getMessage()]);
|
||||
return new Collection([]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static function fetchArticles(): Collection
|
||||
public static function fetchArticleData(Article $article): array
|
||||
{
|
||||
try {
|
||||
$response = Http::get('https://www.vrt.be/vrtnws/en/');
|
||||
$html = $response->body();
|
||||
$html = HttpFetcher::fetchHtml($article->url);
|
||||
$parser = ArticleParserFactory::getParser($article->url);
|
||||
|
||||
// Extract article links using regex
|
||||
preg_match_all('/href="(\/vrtnws\/en\/\d{4}\/\d{2}\/\d{2}\/[^"]+)"/', $html, $matches);
|
||||
|
||||
$urls = collect($matches[1] ?? [])
|
||||
->unique()
|
||||
->map(fn ($path) => 'https://www.vrt.be' . $path)
|
||||
->toArray();
|
||||
|
||||
$responses = Http::pool(function ($pool) use ($urls) {
|
||||
foreach ($urls as $url) {
|
||||
$pool->get($url);
|
||||
}
|
||||
});
|
||||
|
||||
return collect($responses)
|
||||
->map(function ($response, $index) use ($urls) {
|
||||
if (!isset($urls[$index])) {
|
||||
return null;
|
||||
}
|
||||
|
||||
$url = $urls[$index];
|
||||
|
||||
try {
|
||||
if ($response->successful()) {
|
||||
return $url;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
} catch (Exception) {
|
||||
return null;
|
||||
}
|
||||
})
|
||||
->filter(fn($article) => !empty($article));
|
||||
return $parser->extractData($html);
|
||||
} catch (Exception $e) {
|
||||
logger()->error("Failed to fetch VRT homepage", ['error' => $e->getMessage()]);
|
||||
return new Collection([]);
|
||||
logger()->error('Exception while fetching article data', [
|
||||
'url' => $article->url,
|
||||
'error' => $e->getMessage()
|
||||
]);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -10,11 +10,29 @@ public static function validate(Article $article): Article
|
|||
{
|
||||
logger('Checking keywords for article: ' . $article->id);
|
||||
|
||||
$articleData = ArticleFetcher::fetchArticleData($article);
|
||||
$validationResult = self::validateByKeywords($articleData['full_article']);
|
||||
|
||||
$article->update([
|
||||
'is_valid' => true,
|
||||
'is_valid' => $validationResult,
|
||||
'validated_at' => now(),
|
||||
]);
|
||||
|
||||
return $article->refresh();
|
||||
}
|
||||
|
||||
/**
 * Decide whether the article text mentions at least one tracked keyword.
 *
 * Matching is case-insensitive. A null or empty text never matches, so an
 * article whose body could not be extracted is reported as not valid instead
 * of raising a TypeError on the string parameter.
 *
 * @param string|null $full_article Article body text, or null when unavailable.
 * @return bool True when any keyword occurs in the text.
 */
private static function validateByKeywords(?string $full_article): bool
{
    if ($full_article === null || $full_article === '') {
        return false;
    }

    $keywords = [
        'N-VA', 'Bart De Wever', 'Frank Vandenbroucke',
    ];

    foreach ($keywords as $keyword) {
        if (stripos($full_article, $keyword) !== false) {
            return true;
        }
    }

    return false;
}
|
||||
}
|
||||
|
|
|
|||
42
app/Services/Factories/ArticleParserFactory.php
Normal file
42
app/Services/Factories/ArticleParserFactory.php
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
<?php
|
||||
|
||||
namespace App\Services\Factories;
|
||||
|
||||
use App\Contracts\ArticleParserInterface;
|
||||
use App\Services\Parsers\VrtArticleParser;
|
||||
use Exception;
|
||||
|
||||
/**
 * Resolves the article parser responsible for a given URL.
 *
 * Parsers are stored as class names and instantiated on demand with a
 * no-argument constructor.
 */
class ArticleParserFactory
{
    /**
     * Registered parser class names, tried in registration order.
     *
     * @var list<class-string<ArticleParserInterface>>
     */
    private static array $parsers = [
        VrtArticleParser::class,
    ];

    /**
     * Return the first registered parser whose canParse() accepts the URL.
     *
     * @throws Exception When no registered parser accepts the URL.
     */
    public static function getParser(string $url): ArticleParserInterface
    {
        foreach (self::$parsers as $parserClass) {
            $parser = new $parserClass();

            if ($parser->canParse($url)) {
                return $parser;
            }
        }

        throw new Exception("No parser found for URL: {$url}");
    }

    /**
     * List the source names of all registered parsers.
     *
     * @return list<string>
     */
    public static function getSupportedSources(): array
    {
        return array_map(
            static fn (string $parserClass): string => (new $parserClass())->getSourceName(),
            self::$parsers
        );
    }

    /**
     * Register an additional parser class; registering the same class twice is a no-op.
     *
     * @param string $parserClass Class name of an ArticleParserInterface implementation.
     *
     * @throws \InvalidArgumentException When the class does not exist or does not
     *                                   implement ArticleParserInterface.
     */
    public static function registerParser(string $parserClass): void
    {
        // Reject invalid classes at registration time; otherwise the mistake only
        // surfaces later, when getParser() instantiates the class.
        if (!is_subclass_of($parserClass, ArticleParserInterface::class)) {
            throw new \InvalidArgumentException(
                "Parser class {$parserClass} must implement " . ArticleParserInterface::class
            );
        }

        if (!in_array($parserClass, self::$parsers, true)) {
            self::$parsers[] = $parserClass;
        }
    }
}
|
||||
47
app/Services/Factories/HomepageParserFactory.php
Normal file
47
app/Services/Factories/HomepageParserFactory.php
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
<?php
|
||||
|
||||
namespace App\Services\Factories;
|
||||
|
||||
use App\Contracts\HomepageParserInterface;
|
||||
use App\Services\Parsers\VrtHomepageParserAdapter;
|
||||
use Exception;
|
||||
|
||||
/**
 * Resolves homepage parsers, either for one URL or for all registered sources.
 *
 * Parsers are stored as class names and instantiated on demand with a
 * no-argument constructor.
 */
class HomepageParserFactory
{
    /**
     * Registered parser class names, tried in registration order.
     *
     * @var list<class-string<HomepageParserInterface>>
     */
    private static array $parsers = [
        VrtHomepageParserAdapter::class,
    ];

    /**
     * Return the first registered parser whose canParse() accepts the URL.
     *
     * @throws Exception When no registered parser accepts the URL.
     */
    public static function getParser(string $url): HomepageParserInterface
    {
        foreach (self::$parsers as $parserClass) {
            $parser = new $parserClass();

            if ($parser->canParse($url)) {
                return $parser;
            }
        }

        throw new Exception("No homepage parser found for URL: {$url}");
    }

    /**
     * Instantiate every registered parser.
     *
     * @return list<HomepageParserInterface>
     */
    public static function getAllParsers(): array
    {
        return array_map(
            static fn (string $parserClass) => new $parserClass(),
            self::$parsers
        );
    }

    /**
     * List the source names of all registered parsers.
     *
     * @return list<string>
     */
    public static function getSupportedSources(): array
    {
        return array_map(
            static fn (string $parserClass): string => (new $parserClass())->getSourceName(),
            self::$parsers
        );
    }

    /**
     * Register an additional parser class; registering the same class twice is a no-op.
     *
     * @param string $parserClass Class name of a HomepageParserInterface implementation.
     *
     * @throws \InvalidArgumentException When the class does not exist or does not
     *                                   implement HomepageParserInterface.
     */
    public static function registerParser(string $parserClass): void
    {
        // Reject invalid classes at registration time; otherwise the mistake only
        // surfaces later, when a parser is instantiated and used.
        if (!is_subclass_of($parserClass, HomepageParserInterface::class)) {
            throw new \InvalidArgumentException(
                "Parser class {$parserClass} must implement " . HomepageParserInterface::class
            );
        }

        if (!in_array($parserClass, self::$parsers, true)) {
            self::$parsers[] = $parserClass;
        }
    }
}
|
||||
77
app/Services/Http/HttpFetcher.php
Normal file
77
app/Services/Http/HttpFetcher.php
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
<?php
|
||||
|
||||
namespace App\Services\Http;
|
||||
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use Exception;
|
||||
|
||||
/**
 * Thin wrapper around Laravel's HTTP client for downloading HTML.
 */
class HttpFetcher
{
    /**
     * Download a single URL and return the response body.
     *
     * Failures are logged and then re-thrown so the caller decides how to react.
     *
     * @throws Exception When the request throws or the response is not successful.
     */
    public static function fetchHtml(string $url): string
    {
        try {
            $response = Http::get($url);

            if (!$response->successful()) {
                throw new Exception("Failed to fetch URL: {$url} - Status: {$response->status()}");
            }

            return $response->body();
        } catch (Exception $e) {
            logger()->error('HTTP fetch failed', [
                'url' => $url,
                'error' => $e->getMessage()
            ]);

            throw $e;
        }
    }

    /**
     * Download several URLs concurrently.
     *
     * @param array $urls URLs to fetch; keys are ignored.
     *
     * @return array One entry per URL, in request order. Each entry has "url",
     *               "html" (string or null) and "success" (bool); failed entries
     *               also carry either "status" or "error". An empty array is
     *               returned when the pool itself throws.
     */
    public static function fetchMultipleUrls(array $urls): array
    {
        // Responses are matched to URLs by position, so drop any string or sparse
        // keys first; otherwise the lookup below silently discards results.
        $urls = array_values($urls);

        if ($urls === []) {
            return [];
        }

        try {
            $responses = Http::pool(function ($pool) use ($urls) {
                foreach ($urls as $url) {
                    $pool->get($url);
                }
            });

            return collect($responses)
                ->map(function ($response, $index) use ($urls) {
                    if (!isset($urls[$index])) {
                        return null;
                    }

                    $url = $urls[$index];

                    // Per Laravel's HTTP client, a pooled request that could not be
                    // completed may come back as an exception object instead of a
                    // Response. Calling successful() on it would raise an Error
                    // that the catch below does not handle.
                    if (!$response instanceof \Illuminate\Http\Client\Response) {
                        return [
                            'url' => $url,
                            'html' => null,
                            'success' => false,
                            'error' => $response instanceof \Throwable
                                ? $response->getMessage()
                                : 'Exception occurred'
                        ];
                    }

                    try {
                        if ($response->successful()) {
                            return [
                                'url' => $url,
                                'html' => $response->body(),
                                'success' => true
                            ];
                        }

                        return [
                            'url' => $url,
                            'html' => null,
                            'success' => false,
                            'status' => $response->status()
                        ];
                    } catch (Exception) {
                        return [
                            'url' => $url,
                            'html' => null,
                            'success' => false,
                            'error' => 'Exception occurred'
                        ];
                    }
                })
                ->filter(fn($result) => $result !== null)
                ->toArray();
        } catch (Exception $e) {
            logger()->error('Multiple URL fetch failed', ['error' => $e->getMessage()]);

            return [];
        }
    }
}
|
||||
|
|
@ -1,8 +1,8 @@
|
|||
<?php
|
||||
|
||||
namespace App\Services\Article;
|
||||
namespace App\Services\Parsers;
|
||||
|
||||
class ArticleDataExtractor
|
||||
class VrtArticlePageParser
|
||||
{
|
||||
public static function extractTitle(string $html): ?string
|
||||
{
|
||||
|
|
@ -39,15 +39,6 @@ public static function extractDescription(string $html): ?string
|
|||
return null;
|
||||
}
|
||||
|
||||
public static function extractData(string $html): array
|
||||
{
|
||||
return [
|
||||
'title' => self::extractTitle($html),
|
||||
'description' => self::extractDescription($html),
|
||||
'full_article' => self::extractFullArticle($html),
|
||||
];
|
||||
}
|
||||
|
||||
public static function extractFullArticle(string $html): ?string
|
||||
{
|
||||
// Remove scripts, styles, and other non-content elements
|
||||
|
|
@ -72,4 +63,13 @@ public static function extractFullArticle(string $html): ?string
|
|||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Run the three extractors on the same HTML and bundle their results.
 *
 * @return array{title: ?string, description: ?string, full_article: ?string}
 */
public static function extractData(string $html): array
{
    $title = self::extractTitle($html);
    $description = self::extractDescription($html);
    $fullArticle = self::extractFullArticle($html);

    return [
        'title' => $title,
        'description' => $description,
        'full_article' => $fullArticle,
    ];
}
|
||||
}
|
||||
23
app/Services/Parsers/VrtArticleParser.php
Normal file
23
app/Services/Parsers/VrtArticleParser.php
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
<?php
|
||||
|
||||
namespace App\Services\Parsers;
|
||||
|
||||
use App\Contracts\ArticleParserInterface;
|
||||
|
||||
/**
 * Article parser for VRT News pages; extraction is delegated to VrtArticlePageParser.
 */
class VrtArticleParser implements ArticleParserInterface
{
    /**
     * Accept URLs whose host is vrt.be or one of its subdomains.
     *
     * The host is compared instead of searching the whole URL, so "vrt.be"
     * appearing in a path, query string or unrelated domain (for example
     * "notvrt.be") no longer matches.
     */
    public function canParse(string $url): bool
    {
        $host = parse_url($url, PHP_URL_HOST);

        if (!is_string($host)) {
            // Scheme-less input such as "www.vrt.be/vrtnws/..." has no host
            // component; retry it as a network-path reference.
            $host = parse_url('//' . ltrim($url, '/'), PHP_URL_HOST);
        }

        if (!is_string($host)) {
            return false;
        }

        $host = strtolower($host);

        return $host === 'vrt.be' || str_ends_with($host, '.vrt.be');
    }

    /**
     * Extract article data from VRT article HTML.
     *
     * @return array Whatever VrtArticlePageParser::extractData() returns for the HTML.
     */
    public function extractData(string $html): array
    {
        return VrtArticlePageParser::extractData($html);
    }

    /**
     * Return the display name of this source.
     */
    public function getSourceName(): string
    {
        return 'VRT News';
    }
}
|
||||
19
app/Services/Parsers/VrtHomepageParser.php
Normal file
19
app/Services/Parsers/VrtHomepageParser.php
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
<?php
|
||||
|
||||
namespace App\Services\Parsers;
|
||||
|
||||
/**
 * Finds article links in the HTML of the VRT News (English) homepage.
 */
class VrtHomepageParser
{
    /**
     * Collect the distinct article URLs linked from the given HTML.
     *
     * Only href values of the form /vrtnws/en/YYYY/MM/DD/... are considered.
     * Each match is turned into an absolute https://www.vrt.be URL.
     *
     * @return list<string> Unique URLs in order of first appearance, indexed from 0.
     */
    public static function extractArticleUrls(string $html): array
    {
        // Extract article links using regex
        preg_match_all('/href="(\/vrtnws\/en\/\d{4}\/\d{2}\/\d{2}\/[^"]+)"/', $html, $matches);

        // array_unique() keeps the original keys, so re-index to hand callers a
        // gap-free list they can address by position.
        $paths = array_values(array_unique($matches[1] ?? []));

        return array_map(
            static fn (string $path): string => 'https://www.vrt.be' . $path,
            $paths
        );
    }
}
|
||||
28
app/Services/Parsers/VrtHomepageParserAdapter.php
Normal file
28
app/Services/Parsers/VrtHomepageParserAdapter.php
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
<?php
|
||||
|
||||
namespace App\Services\Parsers;
|
||||
|
||||
use App\Contracts\HomepageParserInterface;
|
||||
|
||||
/**
 * Exposes the static VrtHomepageParser through HomepageParserInterface.
 */
class VrtHomepageParserAdapter implements HomepageParserInterface
{
    /**
     * Accept URLs whose host is vrt.be or one of its subdomains.
     *
     * The host is compared instead of searching the whole URL, so "vrt.be"
     * appearing in a path, query string or unrelated domain (for example
     * "notvrt.be") no longer matches.
     */
    public function canParse(string $url): bool
    {
        $host = parse_url($url, PHP_URL_HOST);

        if (!is_string($host)) {
            // Scheme-less input such as "www.vrt.be/vrtnws/en/" has no host
            // component; retry it as a network-path reference.
            $host = parse_url('//' . ltrim($url, '/'), PHP_URL_HOST);
        }

        if (!is_string($host)) {
            return false;
        }

        $host = strtolower($host);

        return $host === 'vrt.be' || str_ends_with($host, '.vrt.be');
    }

    /**
     * Extract article URLs from VRT homepage HTML.
     *
     * @return array Whatever VrtHomepageParser::extractArticleUrls() returns for the HTML.
     */
    public function extractArticleUrls(string $html): array
    {
        return VrtHomepageParser::extractArticleUrls($html);
    }

    /**
     * Return the URL of the VRT News (English) homepage.
     */
    public function getHomepageUrl(): string
    {
        return 'https://www.vrt.be/vrtnws/en/';
    }

    /**
     * Return the display name of this source.
     */
    public function getSourceName(): string
    {
        return 'VRT News';
    }
}
|
||||
Loading…
Reference in a new issue