fedi-feed-router/app/Services/Article/ArticleFetcher.php

94 lines
2.7 KiB
PHP

<?php
namespace App\Services\Article;
use App\Models\Article;
use Exception;
use Illuminate\Http\Client\Response;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Http;
/**
 * Discovers article URLs on the VRT NWS (English) homepage, stores them as
 * Article records and fetches the HTML of individual articles for extraction.
 */
class ArticleFetcher
{
    /** Homepage that is scanned for article links. */
    private const HOMEPAGE_URL = 'https://www.vrt.be/vrtnws/en/';

    /** Scheme + host prepended to the relative paths found on the homepage. */
    private const BASE_URL = 'https://www.vrt.be';

    /** Matches relative article links of the form /vrtnws/en/YYYY/MM/DD/<slug>. */
    private const ARTICLE_LINK_PATTERN = '/href="(\/vrtnws\/en\/\d{4}\/\d{2}\/\d{2}\/[^"]+)"/';

    /**
     * Fetch the article URLs currently reachable from the homepage and make
     * sure an Article record exists for each of them.
     *
     * @return Collection<int, Article>
     */
    public static function getNewArticles(): Collection
    {
        return self::fetchArticles()
            ->map(fn (string $url) => self::saveArticle($url));
    }

    /**
     * Download the article's page and run it through the data extractor.
     *
     * Failures (non-2xx status or a thrown exception) are logged and reported
     * to the caller as an empty array.
     *
     * @return array Data returned by ArticleDataExtractor::extractData(), or [] on failure.
     */
    public static function fetchArticle(Article $article): array
    {
        try {
            $response = Http::get($article->url);

            if (!$response->successful()) {
                logger()->error('Failed to fetch article', [
                    'url' => $article->url,
                    'status' => $response->status(),
                ]);

                return [];
            }

            return ArticleDataExtractor::extractData($response->body());
        } catch (Exception $e) {
            logger()->error('Exception while fetching article', [
                'url' => $article->url,
                'error' => $e->getMessage(),
            ]);

            // Must be an array: returning the model here would violate the
            // declared return type and raise a TypeError.
            return [];
        }
    }

    /**
     * Collect the absolute URLs of all homepage articles that respond
     * successfully when requested.
     *
     * @return Collection<int, string>
     */
    private static function fetchArticles(): Collection
    {
        try {
            $homepage = Http::get(self::HOMEPAGE_URL);

            if (!$homepage->successful()) {
                logger()->error("Failed to fetch VRT homepage", [
                    'status' => $homepage->status(),
                ]);

                return new Collection([]);
            }

            $urls = self::extractArticleUrls($homepage->body());

            if ($urls === []) {
                return new Collection([]);
            }

            $responses = Http::pool(function ($pool) use ($urls) {
                foreach ($urls as $url) {
                    $pool->get($url);
                }
            });

            $reachable = [];

            foreach ($responses as $index => $response) {
                // A pooled request that failed at the transport level may come
                // back as an exception object instead of a Response, so check
                // the type before calling successful().
                if (
                    isset($urls[$index])
                    && $response instanceof Response
                    && $response->successful()
                ) {
                    $reachable[] = $urls[$index];
                }
            }

            return new Collection($reachable);
        } catch (Exception $e) {
            logger()->error("Failed to fetch VRT homepage", ['error' => $e->getMessage()]);

            return new Collection([]);
        }
    }

    /**
     * Extract the distinct article URLs from the homepage HTML.
     *
     * The result is re-indexed from 0 so that its keys line up with the
     * sequential keys of the pooled responses; unique() on its own keeps the
     * original keys and leaves gaps wherever a duplicate link was removed.
     *
     * @return list<string>
     */
    private static function extractArticleUrls(string $html): array
    {
        preg_match_all(self::ARTICLE_LINK_PATTERN, $html, $matches);

        return collect($matches[1] ?? [])
            ->unique()
            ->values()
            ->map(fn ($path) => self::BASE_URL . $path)
            ->all();
    }

    /**
     * Return the Article stored for the URL, creating it when none exists yet.
     */
    private static function saveArticle(string $url): Article
    {
        return Article::firstOrCreate(['url' => $url]);
    }
}