fedi-feed-router/app/Services/Article/ArticleFetcher.php

182 lines
5.1 KiB
PHP
Raw Normal View History

2025-06-29 09:37:49 +02:00
<?php
2025-06-29 17:46:06 +02:00
namespace App\Services\Article;
2025-06-29 09:37:49 +02:00
use App\Models\Article;
2025-07-05 18:26:04 +02:00
use App\Models\Feed;
2025-06-29 21:33:18 +02:00
use App\Services\Factories\ArticleParserFactory;
use App\Services\Factories\HomepageParserFactory;
use App\Services\Http\HttpFetcher;
2025-07-05 18:26:04 +02:00
use App\Services\Log\LogSaver;
2025-06-29 09:37:49 +02:00
use Exception;
2025-06-29 17:13:18 +02:00
use Illuminate\Support\Collection;
2025-06-29 09:37:49 +02:00
class ArticleFetcher
{
2025-08-15 02:50:42 +02:00
/**
 * @param LogSaver $logSaver Logger used to record warnings and errors raised while fetching.
 */
public function __construct(
    private LogSaver $logSaver,
) {
}
2025-07-07 00:51:32 +02:00
/**
 * Collect the articles advertised by a feed, dispatching on the feed's type.
 *
 * 'rss' and 'website' feeds are delegated to their dedicated fetchers; any
 * other type is logged as a warning and produces an empty collection.
 *
 * @return Collection<int, Article>
 */
public function getArticlesFromFeed(Feed $feed): Collection
{
    // match compares with ===, exactly like the strict checks it replaces.
    return match ($feed->type) {
        'rss' => $this->getArticlesFromRssFeed($feed),
        'website' => $this->getArticlesFromWebsiteFeed($feed),
        default => $this->skipUnsupportedFeed($feed),
    };
}

/**
 * Log a warning for a feed whose type has no fetcher and return an empty collection.
 *
 * @return Collection<int, Article>
 */
private function skipUnsupportedFeed(Feed $feed): Collection
{
    $this->logSaver->warning('Unsupported feed type', null, [
        'feed_id' => $feed->id,
        'feed_type' => $feed->type,
    ]);

    return collect();
}
2025-07-07 00:51:32 +02:00
/**
 * Download an RSS document and register one Article per item link.
 *
 * The feed body is fetched through HttpFetcher::fetchHtml() and parsed with
 * SimpleXML. Every <item> under <channel> whose <link> is non-empty after
 * trimming is passed to saveArticle(). If the document cannot be parsed or
 * has no channel items, a warning is logged and an empty collection is
 * returned. Any Exception raised while fetching or saving is logged as an
 * error and also results in an empty collection.
 *
 * @return Collection<int, Article>
 */
private function getArticlesFromRssFeed(Feed $feed): Collection
{
    try {
        $xml = HttpFetcher::fetchHtml($feed->url);

        // Keep libxml parse errors internal instead of emitting PHP warnings,
        // and restore the caller's setting whatever the outcome.
        $previousUseErrors = libxml_use_internal_errors(true);
        try {
            $rss = simplexml_load_string($xml);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousUseErrors);
        }

        if ($rss === false || ! isset($rss->channel->item)) {
            $this->logSaver->warning('Failed to parse RSS feed XML', null, [
                'feed_id' => $feed->id,
                'feed_url' => $feed->url,
            ]);

            return collect();
        }

        $articles = collect();
        foreach ($rss->channel->item as $item) {
            // Pretty-printed feeds wrap the URL in newlines/indentation; trim it
            // so whitespace-only links are skipped and stored URLs are clean.
            $link = trim((string) $item->link);
            if ($link !== '') {
                $articles->push($this->saveArticle($link, $feed->id));
            }
        }

        return $articles;
    } catch (Exception $e) {
        $this->logSaver->error('Failed to fetch articles from RSS feed', null, [
            'feed_id' => $feed->id,
            'feed_url' => $feed->url,
            'error' => $e->getMessage(),
        ]);

        return collect();
    }
}
2025-06-29 21:20:45 +02:00
2025-07-07 00:51:32 +02:00
/**
 * Scrape a website feed's page for article URLs and register each one.
 *
 * A homepage parser is looked up for the feed first; when none is available
 * a warning is logged and an empty collection is returned. Otherwise the
 * page is fetched, the parser extracts the article URLs, and each URL is
 * passed to saveArticle(). Any Exception along the way is logged as an
 * error and results in an empty collection.
 *
 * @return Collection<int, Article>
 */
private function getArticlesFromWebsiteFeed(Feed $feed): Collection
{
    try {
        $homepageParser = HomepageParserFactory::getParserForFeed($feed);

        if ($homepageParser) {
            $pageHtml = HttpFetcher::fetchHtml($feed->url);
            $articleUrls = $homepageParser->extractArticleUrls($pageHtml);

            return collect($articleUrls)->map(
                function (string $url) use ($feed) {
                    return $this->saveArticle($url, $feed->id);
                }
            );
        }

        // No parser is registered for this feed, so there is nothing to scrape.
        $this->logSaver->warning('No parser available for feed URL', null, [
            'feed_id' => $feed->id,
            'feed_url' => $feed->url,
        ]);

        return collect();
    } catch (Exception $exception) {
        $this->logSaver->error('Failed to fetch articles from website feed', null, [
            'feed_id' => $feed->id,
            'feed_url' => $feed->url,
            'error' => $exception->getMessage(),
        ]);

        return collect();
    }
}
2025-07-07 00:51:32 +02:00
/**
 * Fetch an article's page and extract its data with the parser chosen for its URL.
 *
 * Any Exception raised while fetching or parsing is logged as an error and
 * an empty array is returned instead.
 *
 * @return array<string, mixed>
 */
public function fetchArticleData(Article $article): array
{
    try {
        // The page is fetched before the parser is resolved.
        $markup = HttpFetcher::fetchHtml($article->url);

        return ArticleParserFactory::getParser($article->url)->extractData($markup);
    } catch (Exception $exception) {
        $context = [
            'url' => $article->url,
            'error' => $exception->getMessage(),
        ];
        $this->logSaver->error('Exception while fetching article data', null, $context);

        return [];
    }
}
2025-08-15 02:50:42 +02:00
/**
 * Find the Article stored for a URL, creating it when it does not exist yet.
 *
 * A new article receives the given feed id and a fallback title derived
 * from the URL, and its fetched event is dispatched. An existing article is
 * returned untouched.
 *
 * @throws Exception Re-thrown, after being logged, when the lookup or creation fails.
 */
private function saveArticle(string $url, ?int $feedId = null): Article
{
    $defaults = [
        'feed_id' => $feedId,
        'title' => $this->generateFallbackTitle($url),
    ];

    try {
        $article = Article::firstOrCreate(['url' => $url], $defaults);

        if (! $article->wasRecentlyCreated) {
            return $article;
        }

        // Only articles created by this call announce themselves.
        $article->dispatchFetchedEvent();

        return $article;
    } catch (Exception $exception) {
        $this->logSaver->error('Failed to create article', null, [
            'url' => $url,
            'feed_id' => $feedId,
            'error' => $exception->getMessage(),
        ]);

        throw $exception;
    }
}
2025-08-15 02:50:42 +02:00
/**
 * Derive a human-readable placeholder title from the last segment of a URL.
 *
 * The last path segment (or the whole URL when it has no usable path) is
 * percent-decoded, stripped of its file extension, has '-' and '_' turned
 * into spaces, is capitalised word by word and trimmed. When nothing is
 * left, 'Untitled Article' is returned.
 *
 * @param string $url Article URL to derive the title from.
 * @return string Non-empty title.
 */
private function generateFallbackTitle(string $url): string
{
    // parse_url() yields null/false when there is no usable path; fall back to the raw URL.
    $path = parse_url($url, PHP_URL_PATH);
    $filename = basename($path ?: $url);

    // Turn percent-escapes (e.g. "caf%C3%A9") into real characters, but keep the
    // raw segment if decoding would produce invalid UTF-8.
    $decoded = rawurldecode($filename);
    if (preg_match('//u', $decoded) === 1) {
        $filename = $decoded;
    }

    // Drop the file extension; preg_replace() returns null on a PCRE error.
    $title = preg_replace('/\.[^.]*$/', '', $filename) ?? $filename;
    $title = trim(ucwords(str_replace(['-', '_'], ' ', $title)));

    // Strict comparison so a legitimate slug such as "0" is kept, while slugs that
    // consisted only of separators (now whitespace, trimmed away) use the default.
    return $title !== '' ? $title : 'Untitled Article';
}
}