181 lines
5.1 KiB
PHP
181 lines
5.1 KiB
PHP
<?php
|
|
|
|
namespace App\Services\Article;
|
|
|
|
use App\Models\Article;
|
|
use App\Models\Feed;
|
|
use App\Services\Factories\ArticleParserFactory;
|
|
use App\Services\Factories\HomepageParserFactory;
|
|
use App\Services\Http\HttpFetcher;
|
|
use App\Services\Log\LogSaver;
|
|
use Exception;
|
|
use Illuminate\Support\Collection;
|
|
|
|
/**
 * Discovers article URLs for a feed and stores them as Article records.
 *
 * Two feed types are handled: 'rss' (links read from the channel items of an
 * RSS document) and 'website' (links extracted from the feed URL's HTML by a
 * parser obtained from HomepageParserFactory). Failures are reported through
 * the injected LogSaver; feed-level failures yield an empty collection rather
 * than an exception.
 */
class ArticleFetcher
{
    public function __construct(
        private LogSaver $logSaver
    ) {}

    /**
     * Fetch and persist the articles referenced by a feed.
     *
     * @return Collection<int, Article> Empty when the feed type is unsupported
     *                                  or the feed could not be processed.
     */
    public function getArticlesFromFeed(Feed $feed): Collection
    {
        if ($feed->type === 'rss') {
            return $this->getArticlesFromRssFeed($feed);
        }

        if ($feed->type === 'website') {
            return $this->getArticlesFromWebsiteFeed($feed);
        }

        $this->logSaver->warning('Unsupported feed type', null, [
            'feed_id' => $feed->id,
            'feed_type' => $feed->type,
        ]);

        return collect();
    }

    /**
     * Download the feed URL, parse it as RSS and persist every item link.
     *
     * @return Collection<int, Article>
     */
    private function getArticlesFromRssFeed(Feed $feed): Collection
    {
        try {
            $xml = HttpFetcher::fetchHtml($feed->url);

            // Collect libxml diagnostics instead of emitting PHP warnings, and
            // start from an empty buffer so the messages logged below belong
            // to this document only.
            $previousUseErrors = libxml_use_internal_errors(true);
            libxml_clear_errors();

            try {
                // LIBXML_NONET: the document is remote, untrusted input, so the
                // parser must not be allowed to touch the network while loading.
                $rss = simplexml_load_string($xml, \SimpleXMLElement::class, LIBXML_NONET);

                // Keep only the first few messages; a non-XML response can
                // produce a very long error list.
                $parseErrors = array_map(
                    static fn (\LibXMLError $error): string => trim($error->message),
                    array_slice(libxml_get_errors(), 0, 5),
                );
            } finally {
                // Always restore the global libxml state for other callers.
                libxml_clear_errors();
                libxml_use_internal_errors($previousUseErrors);
            }

            if ($rss === false || ! isset($rss->channel->item)) {
                if ($parseErrors !== []) {
                    $reason = implode('; ', $parseErrors);
                } elseif ($rss === false) {
                    $reason = 'document could not be parsed';
                } else {
                    $reason = 'no channel items found';
                }

                $this->logSaver->warning('Failed to parse RSS feed XML', null, [
                    'feed_id' => $feed->id,
                    'feed_url' => $feed->url,
                    'error' => $reason,
                ]);

                return collect();
            }

            $links = [];
            foreach ($rss->channel->item as $item) {
                $links[] = (string) $item->link;
            }

            return $this->saveArticles($links, $feed);
        } catch (Exception $e) {
            $this->logSaver->error('Failed to fetch articles from RSS feed', null, [
                'feed_id' => $feed->id,
                'feed_url' => $feed->url,
                'error' => $e->getMessage(),
            ]);

            return collect();
        }
    }

    /**
     * Download the feed URL as HTML, extract article URLs with the feed's
     * homepage parser and persist each of them.
     *
     * @return Collection<int, Article>
     */
    private function getArticlesFromWebsiteFeed(Feed $feed): Collection
    {
        try {
            $parser = HomepageParserFactory::getParserForFeed($feed);

            if (! $parser) {
                $this->logSaver->warning('No parser available for feed URL', null, [
                    'feed_id' => $feed->id,
                    'feed_url' => $feed->url,
                ]);

                return collect();
            }

            $html = HttpFetcher::fetchHtml($feed->url);
            $urls = $parser->extractArticleUrls($html);

            // collect() normalises whatever the parser returned (array,
            // Collection, null) into something iterable.
            return $this->saveArticles(collect($urls), $feed);
        } catch (Exception $e) {
            $this->logSaver->error('Failed to fetch articles from website feed', null, [
                'feed_id' => $feed->id,
                'feed_url' => $feed->url,
                'error' => $e->getMessage(),
            ]);

            return collect();
        }
    }

    /**
     * Download an article page and extract its data with the matching parser.
     *
     * @return array<string, mixed> Parser output, or an empty array when
     *                              fetching or parsing threw an exception.
     */
    public function fetchArticleData(Article $article): array
    {
        try {
            $html = HttpFetcher::fetchHtml($article->url);
            $parser = ArticleParserFactory::getParser($article->url);

            return $parser->extractData($html);
        } catch (Exception $e) {
            $this->logSaver->error('Exception while fetching article data', null, [
                'url' => $article->url,
                'error' => $e->getMessage(),
            ]);

            return [];
        }
    }

    /**
     * Persist a batch of candidate URLs for one feed.
     *
     * Each URL is trimmed; blank values, non-string values and URLs already
     * seen in this batch are skipped. A failure to save one URL does not stop
     * the remaining URLs from being processed.
     *
     * @param  iterable<mixed>  $urls
     * @return Collection<int, Article> The articles that were saved or found.
     */
    private function saveArticles(iterable $urls, Feed $feed): Collection
    {
        $articles = collect();
        $seen = [];

        foreach ($urls as $url) {
            if (! is_string($url)) {
                continue;
            }

            $url = trim($url);
            if ($url === '' || isset($seen[$url])) {
                continue;
            }
            $seen[$url] = true;

            try {
                $articles->push($this->saveArticle($url, $feed->id));
            } catch (Exception) {
                // saveArticle() has already logged the failure; carry on so
                // that one bad URL does not discard the rest of the feed.
            }
        }

        return $articles;
    }

    /**
     * Find the article with the given URL or create it with a fallback title,
     * dispatching the "fetched" event only for newly created records.
     *
     * @throws Exception Rethrown after being logged.
     */
    private function saveArticle(string $url, ?int $feedId = null): Article
    {
        $fallbackTitle = $this->generateFallbackTitle($url);

        try {
            $article = Article::firstOrCreate(
                ['url' => $url],
                [
                    'feed_id' => $feedId,
                    'title' => $fallbackTitle,
                ]
            );

            if ($article->wasRecentlyCreated) {
                $article->dispatchFetchedEvent();
            }

            return $article;
        } catch (Exception $e) {
            $this->logSaver->error('Failed to create article', null, [
                'url' => $url,
                'feed_id' => $feedId,
                'error' => $e->getMessage(),
            ]);

            throw $e;
        }
    }

    /**
     * Derive a human-readable placeholder title from the last path segment of
     * a URL, e.g. ".../my-first_article.html" becomes "My First Article".
     *
     * Returns 'Untitled Article' when no usable segment remains.
     */
    private function generateFallbackTitle(string $url): string
    {
        // parse_url() yields null/false when there is no path or the URL is
        // malformed; fall back to the raw URL in that case.
        $path = parse_url($url, PHP_URL_PATH);
        $filename = basename(is_string($path) && $path !== '' ? $path : $url);

        // Drop the file extension (everything from the last dot).
        $stem = preg_replace('/\.[^.]*$/', '', $filename) ?? $filename;

        // Turn runs of separators into single spaces and strip the edges so a
        // slug such as "---" does not become a whitespace-only title.
        $title = ucwords(trim(preg_replace('/[-_\s]+/', ' ', $stem) ?? $stem));

        // Compare against '' explicitly: a truthiness test would reject the
        // legitimate title "0".
        return $title !== '' ? $title : 'Untitled Article';
    }
}
|