Decouple source and fetching
This commit is contained in:
parent
97edb507f6
commit
788d649276
14 changed files with 366 additions and 76 deletions
27
app/Console/Commands/FetchArticleCommand.php
Normal file
27
app/Console/Commands/FetchArticleCommand.php
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Console\Commands;
|
||||||
|
|
||||||
|
use App\Models\Article;
|
||||||
|
use App\Services\Article\ArticleFetcher;
|
||||||
|
use Illuminate\Console\Command;
|
||||||
|
|
||||||
|
/**
 * Artisan command `article:fetch {url}`: stores an article record for the
 * given URL, fetches its parsed data and dumps the result to the console.
 */
class FetchArticleCommand extends Command
{
    protected $signature = 'article:fetch {url}';

    protected $description = 'Fetch article from url';

    /**
     * Create the article, fetch its data and print it.
     *
     * @return int self::SUCCESS when data was extracted, self::FAILURE when
     *             the fetcher reported a failure by returning an empty array.
     */
    public function handle(): int
    {
        // createQuietly() is Eloquent's "create without dispatching model events".
        $article = Article::createQuietly([
            'url' => $this->argument('url'),
        ]);

        $res = ArticleFetcher::fetchArticleData($article);

        // fetchArticleData() swallows its own exceptions and signals failure
        // with an empty array; surface that through the exit code instead of
        // reporting success for a fetch that produced nothing.
        if ($res === []) {
            $this->error('Failed to fetch article data from: ' . $article->url);

            return self::FAILURE;
        }

        dump($res);

        return self::SUCCESS;
    }
}
|
||||||
|
|
@ -21,7 +21,7 @@ public function handle(): int
|
||||||
$this->info('Publishing article: ' . $article->url);
|
$this->info('Publishing article: ' . $article->url);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticle($article));
|
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticleData($article));
|
||||||
} catch (Exception) {
|
} catch (Exception) {
|
||||||
return self::FAILURE;
|
return self::FAILURE;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
21
app/Contracts/ArticleParserInterface.php
Normal file
21
app/Contracts/ArticleParserInterface.php
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Contracts;
|
||||||
|
|
||||||
|
/**
 * Contract for a source-specific parser that turns one article page into data.
 */
interface ArticleParserInterface
{
    /**
     * Tell whether this parser is responsible for the given article URL.
     *
     * @param string $url URL of the article page
     */
    public function canParse(string $url): bool;

    /**
     * Pull the article's data out of the page markup.
     *
     * @param string $html raw HTML of the article page
     *
     * @return array extracted fields; the exact keys are up to the
     *               implementation (presumably title / description /
     *               full_article, as the VRT parser returns — confirm)
     */
    public function extractData(string $html): array;

    /**
     * Human-readable name of the news source this parser belongs to.
     */
    public function getSourceName(): string;
}
|
||||||
26
app/Contracts/HomepageParserInterface.php
Normal file
26
app/Contracts/HomepageParserInterface.php
Normal file
|
|
@ -0,0 +1,26 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Contracts;
|
||||||
|
|
||||||
|
/**
 * Contract for a source-specific parser that discovers articles on a homepage.
 */
interface HomepageParserInterface
{
    /**
     * Tell whether this parser is responsible for the given homepage URL.
     *
     * @param string $url URL of the homepage
     */
    public function canParse(string $url): bool;

    /**
     * Collect the article URLs linked from the homepage markup.
     *
     * @param string $html raw HTML of the homepage
     *
     * @return array article URLs found in the markup
     */
    public function extractArticleUrls(string $html): array;

    /**
     * URL of the homepage that should be fetched for this source.
     */
    public function getHomepageUrl(): string;

    /**
     * Human-readable name of the news source this parser belongs to.
     */
    public function getSourceName(): string;
}
|
||||||
|
|
@ -18,6 +18,6 @@ public function handle(ArticleReadyToPublish $event): void
|
||||||
|
|
||||||
logger('Publishing article: ' . $article->id . ' : ' . $article->url);
|
logger('Publishing article: ' . $article->id . ' : ' . $article->url);
|
||||||
|
|
||||||
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticle($article));
|
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticleData($article));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -3,87 +3,49 @@
|
||||||
namespace App\Services\Article;
|
namespace App\Services\Article;
|
||||||
|
|
||||||
use App\Models\Article;
|
use App\Models\Article;
|
||||||
|
use App\Services\Http\HttpFetcher;
|
||||||
|
use App\Services\Factories\ArticleParserFactory;
|
||||||
|
use App\Services\Factories\HomepageParserFactory;
|
||||||
use Exception;
|
use Exception;
|
||||||
use Illuminate\Support\Collection;
|
use Illuminate\Support\Collection;
|
||||||
use Illuminate\Support\Facades\Http;
|
|
||||||
|
|
||||||
class ArticleFetcher
|
class ArticleFetcher
|
||||||
{
|
{
|
||||||
public static function getNewArticles(): Collection
|
public static function getNewArticles(): Collection
|
||||||
{
|
{
|
||||||
return self::fetchArticles()
|
try {
|
||||||
|
$allArticles = collect();
|
||||||
|
|
||||||
|
foreach (HomepageParserFactory::getAllParsers() as $parser) {
|
||||||
|
$html = HttpFetcher::fetchHtml($parser->getHomepageUrl());
|
||||||
|
$urls = $parser->extractArticleUrls($html);
|
||||||
|
|
||||||
|
$articles = collect($urls)
|
||||||
->map(fn (string $url) => self::saveArticle($url));
|
->map(fn (string $url) => self::saveArticle($url));
|
||||||
|
|
||||||
|
$allArticles = $allArticles->merge($articles);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static function fetchArticle(Article $article): array
|
return $allArticles->filter();
|
||||||
|
} catch (Exception $e) {
|
||||||
|
logger()->error("Failed to get new articles", ['error' => $e->getMessage()]);
|
||||||
|
return new Collection([]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static function fetchArticleData(Article $article): array
|
||||||
{
|
{
|
||||||
try {
|
try {
|
||||||
$response = Http::get($article->url);
|
$html = HttpFetcher::fetchHtml($article->url);
|
||||||
|
$parser = ArticleParserFactory::getParser($article->url);
|
||||||
if (!$response->successful()) {
|
|
||||||
logger()->error('Failed to fetch article', [
|
|
||||||
'url' => $article->url,
|
|
||||||
'status' => $response->status()
|
|
||||||
]);
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
|
|
||||||
$html = $response->body();
|
|
||||||
|
|
||||||
return ArticleDataExtractor::extractData($html);
|
|
||||||
|
|
||||||
|
return $parser->extractData($html);
|
||||||
} catch (Exception $e) {
|
} catch (Exception $e) {
|
||||||
logger()->error('Exception while fetching article', [
|
logger()->error('Exception while fetching article data', [
|
||||||
'url' => $article->url,
|
'url' => $article->url,
|
||||||
'error' => $e->getMessage()
|
'error' => $e->getMessage()
|
||||||
]);
|
]);
|
||||||
return $article;
|
return [];
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static function fetchArticles(): Collection
|
|
||||||
{
|
|
||||||
try {
|
|
||||||
$response = Http::get('https://www.vrt.be/vrtnws/en/');
|
|
||||||
$html = $response->body();
|
|
||||||
|
|
||||||
// Extract article links using regex
|
|
||||||
preg_match_all('/href="(\/vrtnws\/en\/\d{4}\/\d{2}\/\d{2}\/[^"]+)"/', $html, $matches);
|
|
||||||
|
|
||||||
$urls = collect($matches[1] ?? [])
|
|
||||||
->unique()
|
|
||||||
->map(fn ($path) => 'https://www.vrt.be' . $path)
|
|
||||||
->toArray();
|
|
||||||
|
|
||||||
$responses = Http::pool(function ($pool) use ($urls) {
|
|
||||||
foreach ($urls as $url) {
|
|
||||||
$pool->get($url);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
return collect($responses)
|
|
||||||
->map(function ($response, $index) use ($urls) {
|
|
||||||
if (!isset($urls[$index])) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
$url = $urls[$index];
|
|
||||||
|
|
||||||
try {
|
|
||||||
if ($response->successful()) {
|
|
||||||
return $url;
|
|
||||||
} else {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
} catch (Exception) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
})
|
|
||||||
->filter(fn($article) => !empty($article));
|
|
||||||
} catch (Exception $e) {
|
|
||||||
logger()->error("Failed to fetch VRT homepage", ['error' => $e->getMessage()]);
|
|
||||||
return new Collection([]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,11 +10,29 @@ public static function validate(Article $article): Article
|
||||||
{
|
{
|
||||||
logger('Checking keywords for article: ' . $article->id);
|
logger('Checking keywords for article: ' . $article->id);
|
||||||
|
|
||||||
|
$articleData = ArticleFetcher::fetchArticleData($article);
|
||||||
|
$validationResult = self::validateByKeywords($articleData['full_article']);
|
||||||
|
|
||||||
$article->update([
|
$article->update([
|
||||||
'is_valid' => true,
|
'is_valid' => $validationResult,
|
||||||
'validated_at' => now(),
|
'validated_at' => now(),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
return $article->refresh();
|
return $article->refresh();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Decide whether the article text mentions at least one tracked keyword.
 *
 * Matching is a case-insensitive substring search (stripos).
 *
 * @param string $full_article full text of the article
 *
 * @return bool true when any keyword occurs in the text, false otherwise
 */
private static function validateByKeywords(string $full_article): bool
{
    $needles = ['N-VA', 'Bart De Wever', 'Frank Vandenbroucke'];

    $hits = array_filter(
        $needles,
        static fn (string $needle): bool => stripos($full_article, $needle) !== false
    );

    return $hits !== [];
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
42
app/Services/Factories/ArticleParserFactory.php
Normal file
42
app/Services/Factories/ArticleParserFactory.php
Normal file
|
|
@ -0,0 +1,42 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Services\Factories;
|
||||||
|
|
||||||
|
use App\Contracts\ArticleParserInterface;
|
||||||
|
use App\Services\Parsers\VrtArticleParser;
|
||||||
|
use Exception;
|
||||||
|
|
||||||
|
/**
 * Registry of article parsers; resolves the parser responsible for a URL.
 */
class ArticleParserFactory
{
    /**
     * Parser class names, tried in registration order.
     *
     * @var list<class-string<ArticleParserInterface>>
     */
    private static array $parsers = [
        VrtArticleParser::class,
    ];

    /**
     * Return the first registered parser whose canParse() accepts the URL.
     *
     * @param string $url article URL to find a parser for
     *
     * @throws Exception when no registered parser accepts the URL
     */
    public static function getParser(string $url): ArticleParserInterface
    {
        foreach (self::$parsers as $parserClass) {
            $parser = new $parserClass();

            if ($parser->canParse($url)) {
                return $parser;
            }
        }

        throw new Exception("No parser found for URL: {$url}");
    }

    /**
     * List the source names of all registered parsers.
     *
     * @return list<string>
     */
    public static function getSupportedSources(): array
    {
        return array_map(
            static fn (string $parserClass): string => (new $parserClass())->getSourceName(),
            self::$parsers
        );
    }

    /**
     * Register an additional parser class; duplicates are ignored.
     *
     * @param string $parserClass fully-qualified class name implementing ArticleParserInterface
     *
     * @throws \InvalidArgumentException when the class does not implement ArticleParserInterface
     */
    public static function registerParser(string $parserClass): void
    {
        // Normalise "\App\Foo" to "App\Foo" so the same class is not registered twice.
        $parserClass = ltrim($parserClass, '\\');

        // Reject bad registrations here rather than failing later, far from
        // the cause, when getParser() instantiates the class.
        if (!is_subclass_of($parserClass, ArticleParserInterface::class)) {
            throw new \InvalidArgumentException(
                "Parser class {$parserClass} must implement " . ArticleParserInterface::class
            );
        }

        if (!in_array($parserClass, self::$parsers, true)) {
            self::$parsers[] = $parserClass;
        }
    }
}
|
||||||
47
app/Services/Factories/HomepageParserFactory.php
Normal file
47
app/Services/Factories/HomepageParserFactory.php
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Services\Factories;
|
||||||
|
|
||||||
|
use App\Contracts\HomepageParserInterface;
|
||||||
|
use App\Services\Parsers\VrtHomepageParserAdapter;
|
||||||
|
use Exception;
|
||||||
|
|
||||||
|
/**
 * Registry of homepage parsers; resolves or enumerates the parsers that
 * discover article URLs on a source's homepage.
 */
class HomepageParserFactory
{
    /**
     * Parser class names, tried in registration order.
     *
     * @var list<class-string<HomepageParserInterface>>
     */
    private static array $parsers = [
        VrtHomepageParserAdapter::class,
    ];

    /**
     * Return the first registered parser whose canParse() accepts the URL.
     *
     * @param string $url homepage URL to find a parser for
     *
     * @throws Exception when no registered parser accepts the URL
     */
    public static function getParser(string $url): HomepageParserInterface
    {
        foreach (self::$parsers as $parserClass) {
            $parser = new $parserClass();

            if ($parser->canParse($url)) {
                return $parser;
            }
        }

        throw new Exception("No homepage parser found for URL: {$url}");
    }

    /**
     * Instantiate every registered parser.
     *
     * @return list<HomepageParserInterface>
     */
    public static function getAllParsers(): array
    {
        return array_map(
            static fn (string $parserClass): HomepageParserInterface => new $parserClass(),
            self::$parsers
        );
    }

    /**
     * List the source names of all registered parsers.
     *
     * @return list<string>
     */
    public static function getSupportedSources(): array
    {
        return array_map(
            static fn (string $parserClass): string => (new $parserClass())->getSourceName(),
            self::$parsers
        );
    }

    /**
     * Register an additional parser class; duplicates are ignored.
     *
     * @param string $parserClass fully-qualified class name implementing HomepageParserInterface
     *
     * @throws \InvalidArgumentException when the class does not implement HomepageParserInterface
     */
    public static function registerParser(string $parserClass): void
    {
        // Normalise "\App\Foo" to "App\Foo" so the same class is not registered twice.
        $parserClass = ltrim($parserClass, '\\');

        // Reject bad registrations here rather than failing later, far from
        // the cause, when the class is instantiated.
        if (!is_subclass_of($parserClass, HomepageParserInterface::class)) {
            throw new \InvalidArgumentException(
                "Parser class {$parserClass} must implement " . HomepageParserInterface::class
            );
        }

        if (!in_array($parserClass, self::$parsers, true)) {
            self::$parsers[] = $parserClass;
        }
    }
}
|
||||||
77
app/Services/Http/HttpFetcher.php
Normal file
77
app/Services/Http/HttpFetcher.php
Normal file
|
|
@ -0,0 +1,77 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Services\Http;
|
||||||
|
|
||||||
|
use Illuminate\Support\Facades\Http;
|
||||||
|
use Exception;
|
||||||
|
|
||||||
|
/**
 * Thin wrapper around Laravel's HTTP client for downloading page markup.
 */
class HttpFetcher
{
    /**
     * Download a single URL and return its body.
     *
     * @param string $url URL to fetch
     *
     * @return string response body
     *
     * @throws Exception when the request fails or the response status is not 2xx;
     *                   the failure is logged before being rethrown
     */
    public static function fetchHtml(string $url): string
    {
        try {
            $response = Http::get($url);

            if (!$response->successful()) {
                throw new Exception("Failed to fetch URL: {$url} - Status: {$response->status()}");
            }

            return $response->body();
        } catch (Exception $e) {
            logger()->error('HTTP fetch failed', [
                'url' => $url,
                'error' => $e->getMessage()
            ]);

            throw $e;
        }
    }

    /**
     * Download several URLs concurrently.
     *
     * Never throws for an individual URL: each entry reports its own outcome.
     *
     * @param array $urls URLs to fetch
     *
     * @return array one entry per URL, keyed by request position, shaped as
     *               ['url' => string, 'html' => ?string, 'success' => bool]
     *               plus 'status' (non-2xx response) or 'error' (transfer failure);
     *               an empty array when the pool itself fails
     */
    public static function fetchMultipleUrls(array $urls): array
    {
        // Pool results are matched to requests by position, so the lookup
        // below needs a 0-based list even if the caller passed other keys.
        $urls = array_values($urls);

        if ($urls === []) {
            return [];
        }

        try {
            $responses = Http::pool(function ($pool) use ($urls) {
                foreach ($urls as $url) {
                    $pool->get($url);
                }
            });

            $results = [];

            foreach ($responses as $index => $response) {
                if (!isset($urls[$index])) {
                    continue;
                }

                $url = $urls[$index];

                // A failed transfer is returned by the pool as an exception
                // object, not a Response. Calling successful() on it would
                // raise an Error that a catch (Exception) cannot intercept.
                if (!$response instanceof \Illuminate\Http\Client\Response) {
                    $results[$index] = [
                        'url' => $url,
                        'html' => null,
                        'success' => false,
                        'error' => $response instanceof \Throwable
                            ? $response->getMessage()
                            : 'Exception occurred',
                    ];

                    continue;
                }

                if ($response->successful()) {
                    $results[$index] = [
                        'url' => $url,
                        'html' => $response->body(),
                        'success' => true,
                    ];

                    continue;
                }

                $results[$index] = [
                    'url' => $url,
                    'html' => null,
                    'success' => false,
                    'status' => $response->status(),
                ];
            }

            return $results;
        } catch (Exception $e) {
            logger()->error('Multiple URL fetch failed', ['error' => $e->getMessage()]);

            return [];
        }
    }
}
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
<?php
|
<?php
|
||||||
|
|
||||||
namespace App\Services\Article;
|
namespace App\Services\Parsers;
|
||||||
|
|
||||||
class ArticleDataExtractor
|
class VrtArticlePageParser
|
||||||
{
|
{
|
||||||
public static function extractTitle(string $html): ?string
|
public static function extractTitle(string $html): ?string
|
||||||
{
|
{
|
||||||
|
|
@ -39,15 +39,6 @@ public static function extractDescription(string $html): ?string
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static function extractData(string $html): array
|
|
||||||
{
|
|
||||||
return [
|
|
||||||
'title' => self::extractTitle($html),
|
|
||||||
'description' => self::extractDescription($html),
|
|
||||||
'full_article' => self::extractFullArticle($html),
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
public static function extractFullArticle(string $html): ?string
|
public static function extractFullArticle(string $html): ?string
|
||||||
{
|
{
|
||||||
// Remove scripts, styles, and other non-content elements
|
// Remove scripts, styles, and other non-content elements
|
||||||
|
|
@ -72,4 +63,13 @@ public static function extractFullArticle(string $html): ?string
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static function extractData(string $html): array
|
||||||
|
{
|
||||||
|
return [
|
||||||
|
'title' => self::extractTitle($html),
|
||||||
|
'description' => self::extractDescription($html),
|
||||||
|
'full_article' => self::extractFullArticle($html),
|
||||||
|
];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
23
app/Services/Parsers/VrtArticleParser.php
Normal file
23
app/Services/Parsers/VrtArticleParser.php
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Services\Parsers;
|
||||||
|
|
||||||
|
use App\Contracts\ArticleParserInterface;
|
||||||
|
|
||||||
|
/**
 * Article parser for VRT News pages; delegates extraction to VrtArticlePageParser.
 */
class VrtArticleParser implements ArticleParserInterface
{
    /**
     * Accept URLs whose host is vrt.be or one of its subdomains.
     *
     * A plain substring test would also accept foreign hosts such as
     * "notvrt.be" or "evil.test/?ref=vrt.be", so the host is compared instead.
     *
     * @param string $url article URL, with or without a scheme
     */
    public function canParse(string $url): bool
    {
        // parse_url() only reports a host when a scheme (or "//") is present,
        // so scheme-less input such as "www.vrt.be/..." is given one first.
        $candidate = str_contains($url, '://') ? $url : 'https://' . ltrim($url, '/');
        $host = parse_url($candidate, PHP_URL_HOST);

        if (!is_string($host)) {
            return false;
        }

        $host = strtolower($host);

        return $host === 'vrt.be' || str_ends_with($host, '.vrt.be');
    }

    /**
     * Extract the article data from a VRT article page.
     *
     * @param string $html raw HTML of the article page
     *
     * @return array whatever VrtArticlePageParser::extractData() returns
     */
    public function extractData(string $html): array
    {
        return VrtArticlePageParser::extractData($html);
    }

    /**
     * Human-readable name of this source.
     */
    public function getSourceName(): string
    {
        return 'VRT News';
    }
}
|
||||||
19
app/Services/Parsers/VrtHomepageParser.php
Normal file
19
app/Services/Parsers/VrtHomepageParser.php
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Services\Parsers;
|
||||||
|
|
||||||
|
/**
 * Extracts article links from the VRT News (English) homepage markup.
 */
class VrtHomepageParser
{
    /**
     * Find all dated article links and return them as absolute URLs.
     *
     * Only hrefs of the form /vrtnws/en/YYYY/MM/DD/<slug> are matched.
     *
     * @param string $html raw HTML of the homepage
     *
     * @return list<string> unique absolute URLs in order of first appearance,
     *                      re-indexed from 0; empty when nothing matches
     */
    public static function extractArticleUrls(string $html): array
    {
        $found = preg_match_all('/href="(\/vrtnws\/en\/\d{4}\/\d{2}\/\d{2}\/[^"]+)"/', $html, $matches);

        // false = PCRE error, 0 = no links; neither leaves anything to return.
        if ($found === false || $found === 0) {
            return [];
        }

        // array_unique() keeps the original keys, so re-index to give callers
        // a real list instead of an array with gaps.
        $paths = array_values(array_unique($matches[1]));

        return array_map(
            static fn (string $path): string => 'https://www.vrt.be' . $path,
            $paths
        );
    }
}
|
||||||
28
app/Services/Parsers/VrtHomepageParserAdapter.php
Normal file
28
app/Services/Parsers/VrtHomepageParserAdapter.php
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Services\Parsers;
|
||||||
|
|
||||||
|
use App\Contracts\HomepageParserInterface;
|
||||||
|
|
||||||
|
/**
 * Adapts the static VrtHomepageParser to HomepageParserInterface.
 */
class VrtHomepageParserAdapter implements HomepageParserInterface
{
    /**
     * Accept URLs whose host is vrt.be or one of its subdomains.
     *
     * A plain substring test would also accept foreign hosts such as
     * "notvrt.be" or "evil.test/?ref=vrt.be", so the host is compared instead.
     *
     * @param string $url homepage URL, with or without a scheme
     */
    public function canParse(string $url): bool
    {
        // parse_url() only reports a host when a scheme (or "//") is present,
        // so scheme-less input such as "www.vrt.be/..." is given one first.
        $candidate = str_contains($url, '://') ? $url : 'https://' . ltrim($url, '/');
        $host = parse_url($candidate, PHP_URL_HOST);

        if (!is_string($host)) {
            return false;
        }

        $host = strtolower($host);

        return $host === 'vrt.be' || str_ends_with($host, '.vrt.be');
    }

    /**
     * Collect article URLs from the VRT homepage markup.
     *
     * @param string $html raw HTML of the homepage
     *
     * @return array whatever VrtHomepageParser::extractArticleUrls() returns
     */
    public function extractArticleUrls(string $html): array
    {
        return VrtHomepageParser::extractArticleUrls($html);
    }

    /**
     * URL of the VRT News English homepage.
     */
    public function getHomepageUrl(): string
    {
        return 'https://www.vrt.be/vrtnws/en/';
    }

    /**
     * Human-readable name of this source.
     */
    public function getSourceName(): string
    {
        return 'VRT News';
    }
}
|
||||||
Loading…
Reference in a new issue