diff --git a/app/Console/Commands/FetchArticleCommand.php b/app/Console/Commands/FetchArticleCommand.php new file mode 100644 index 0000000..0b28190 --- /dev/null +++ b/app/Console/Commands/FetchArticleCommand.php @@ -0,0 +1,27 @@ + $this->argument('url'), + ]); + + $res = ArticleFetcher::fetchArticleData($article); + + dump($res); + + return self::SUCCESS; + } +} diff --git a/app/Console/Commands/PublishToLemmyCommand.php b/app/Console/Commands/PublishToLemmyCommand.php index 9fd5ff4..4afe928 100644 --- a/app/Console/Commands/PublishToLemmyCommand.php +++ b/app/Console/Commands/PublishToLemmyCommand.php @@ -21,7 +21,7 @@ public function handle(): int $this->info('Publishing article: ' . $article->url); try { - LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticle($article)); + LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticleData($article)); } catch (Exception) { return self::FAILURE; } diff --git a/app/Contracts/ArticleParserInterface.php b/app/Contracts/ArticleParserInterface.php new file mode 100644 index 0000000..14b651a --- /dev/null +++ b/app/Contracts/ArticleParserInterface.php @@ -0,0 +1,21 @@ +id . ' : ' . $article->url); - LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticle($article)); + LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticleData($article)); } } diff --git a/app/Services/Article/ArticleFetcher.php b/app/Services/Article/ArticleFetcher.php index 2e8220a..d32c04d 100644 --- a/app/Services/Article/ArticleFetcher.php +++ b/app/Services/Article/ArticleFetcher.php @@ -3,87 +3,49 @@ namespace App\Services\Article; use App\Models\Article; +use App\Services\Http\HttpFetcher; +use App\Services\Factories\ArticleParserFactory; +use App\Services\Factories\HomepageParserFactory; use Exception; use Illuminate\Support\Collection; -use Illuminate\Support\Facades\Http; class ArticleFetcher { public static function getNewArticles(): Collection - { - return self::fetchArticles() - ->map(fn (string $url) => self::saveArticle($url)); - } - - public static function fetchArticle(Article $article): array { try { - $response = Http::get($article->url); + $allArticles = collect(); - if (!$response->successful()) { - logger()->error('Failed to fetch article', [ - 'url' => $article->url, - 'status' => $response->status() - ]); - return []; + foreach (HomepageParserFactory::getAllParsers() as $parser) { + $html = HttpFetcher::fetchHtml($parser->getHomepageUrl()); + $urls = $parser->extractArticleUrls($html); + + $articles = collect($urls) + ->map(fn (string $url) => self::saveArticle($url)); + + $allArticles = $allArticles->merge($articles); } - $html = $response->body(); - - return ArticleDataExtractor::extractData($html); - + return $allArticles->filter(); } catch (Exception $e) { - logger()->error('Exception while fetching article', [ - 'url' => $article->url, - 'error' => $e->getMessage() - ]); - return $article; + logger()->error("Failed to get new articles", ['error' => $e->getMessage()]); + return new Collection([]); } } - - private static function fetchArticles(): Collection + public static function fetchArticleData(Article $article): array { try { - $response = Http::get('https://www.vrt.be/vrtnws/en/'); - $html = $response->body(); + $html = HttpFetcher::fetchHtml($article->url); + $parser = ArticleParserFactory::getParser($article->url); - // Extract article links using regex - preg_match_all('/href="(\/vrtnws\/en\/\d{4}\/\d{2}\/\d{2}\/[^"]+)"/', $html, $matches); - - $urls = collect($matches[1] ?? []) - ->unique() - ->map(fn ($path) => 'https://www.vrt.be' . $path) - ->toArray(); - - $responses = Http::pool(function ($pool) use ($urls) { - foreach ($urls as $url) { - $pool->get($url); - } - }); - - return collect($responses) - ->map(function ($response, $index) use ($urls) { - if (!isset($urls[$index])) { - return null; - } - - $url = $urls[$index]; - - try { - if ($response->successful()) { - return $url; - } else { - return null; - } - } catch (Exception) { - return null; - } - }) - ->filter(fn($article) => !empty($article)); + return $parser->extractData($html); } catch (Exception $e) { - logger()->error("Failed to fetch VRT homepage", ['error' => $e->getMessage()]); - return new Collection([]); + logger()->error('Exception while fetching article data', [ + 'url' => $article->url, + 'error' => $e->getMessage() + ]); + return []; } } diff --git a/app/Services/Article/ValidationService.php b/app/Services/Article/ValidationService.php index aa085a7..c2dee91 100644 --- a/app/Services/Article/ValidationService.php +++ b/app/Services/Article/ValidationService.php @@ -10,11 +10,29 @@ public static function validate(Article $article): Article { logger('Checking keywords for article: ' . $article->id); + $articleData = ArticleFetcher::fetchArticleData($article); + $validationResult = self::validateByKeywords($articleData['full_article']); + $article->update([ - 'is_valid' => true, + 'is_valid' => $validationResult, 'validated_at' => now(), ]); return $article->refresh(); } + + private static function validateByKeywords(string $full_article): bool + { + $keywords = [ + 'N-VA', 'Bart De Wever', 'Frank Vandenbroucke', + ]; + + foreach ($keywords as $keyword) { + if (stripos($full_article, $keyword) !== false) { + return true; + } + } + + return false; + } } diff --git a/app/Services/Factories/ArticleParserFactory.php b/app/Services/Factories/ArticleParserFactory.php new file mode 100644 index 0000000..0d53b9b --- /dev/null +++ b/app/Services/Factories/ArticleParserFactory.php @@ -0,0 +1,42 @@ +canParse($url)) { + return $parser; + } + } + + throw new Exception("No parser found for URL: {$url}"); + } + + public static function getSupportedSources(): array + { + return array_map(function($parserClass) { + $parser = new $parserClass(); + return $parser->getSourceName(); + }, self::$parsers); + } + + public static function registerParser(string $parserClass): void + { + if (!in_array($parserClass, self::$parsers)) { + self::$parsers[] = $parserClass; + } + } +} \ No newline at end of file diff --git a/app/Services/Factories/HomepageParserFactory.php b/app/Services/Factories/HomepageParserFactory.php new file mode 100644 index 0000000..52f2127 --- /dev/null +++ b/app/Services/Factories/HomepageParserFactory.php @@ -0,0 +1,47 @@ +canParse($url)) { + return $parser; + } + } + + throw new Exception("No homepage parser found for URL: {$url}"); + } + + public static function getAllParsers(): array + { + return array_map(fn($parserClass) => new $parserClass(), self::$parsers); + } + + public static function getSupportedSources(): array + { + return array_map(function($parserClass) { + $parser = new $parserClass(); + return $parser->getSourceName(); + }, self::$parsers); + } + + public static function registerParser(string $parserClass): void + { + if (!in_array($parserClass, self::$parsers)) { + self::$parsers[] = $parserClass; + } + } +} \ No newline at end of file diff --git a/app/Services/Http/HttpFetcher.php b/app/Services/Http/HttpFetcher.php new file mode 100644 index 0000000..1d263b6 --- /dev/null +++ b/app/Services/Http/HttpFetcher.php @@ -0,0 +1,77 @@ +successful()) { + throw new Exception("Failed to fetch URL: {$url} - Status: {$response->status()}"); + } + + return $response->body(); + } catch (Exception $e) { + logger()->error('HTTP fetch failed', [ + 'url' => $url, + 'error' => $e->getMessage() + ]); + throw $e; + } + } + + public static function fetchMultipleUrls(array $urls): array + { + try { + $responses = Http::pool(function ($pool) use ($urls) { + foreach ($urls as $url) { + $pool->get($url); + } + }); + + return collect($responses) + ->map(function ($response, $index) use ($urls) { + if (!isset($urls[$index])) { + return null; + } + + $url = $urls[$index]; + + try { + if ($response->successful()) { + return [ + 'url' => $url, + 'html' => $response->body(), + 'success' => true + ]; + } else { + return [ + 'url' => $url, + 'html' => null, + 'success' => false, + 'status' => $response->status() + ]; + } + } catch (Exception) { + return [ + 'url' => $url, + 'html' => null, + 'success' => false, + 'error' => 'Exception occurred' + ]; + } + }) + ->filter(fn($result) => $result !== null) + ->toArray(); + } catch (Exception $e) { + logger()->error('Multiple URL fetch failed', ['error' => $e->getMessage()]); + return []; + } + } +} \ No newline at end of file diff --git a/app/Services/Article/ArticleDataExtractor.php b/app/Services/Parsers/VrtArticlePageParser.php similarity index 97% rename from app/Services/Article/ArticleDataExtractor.php rename to app/Services/Parsers/VrtArticlePageParser.php index aa8cb96..98c1c6c 100644 --- a/app/Services/Article/ArticleDataExtractor.php +++ b/app/Services/Parsers/VrtArticlePageParser.php @@ -1,8 +1,8 @@ self::extractTitle($html), - 'description' => self::extractDescription($html), - 'full_article' => self::extractFullArticle($html), - ]; - } - public static function extractFullArticle(string $html): ?string { // Remove scripts, styles, and other non-content elements @@ -72,4 +63,13 @@ public static function extractFullArticle(string $html): ?string return null; } + + public static function extractData(string $html): array + { + return [ + 'title' => self::extractTitle($html), + 'description' => self::extractDescription($html), + 'full_article' => self::extractFullArticle($html), + ]; + } } \ No newline at end of file diff --git a/app/Services/Parsers/VrtArticleParser.php b/app/Services/Parsers/VrtArticleParser.php new file mode 100644 index 0000000..a86199f --- /dev/null +++ b/app/Services/Parsers/VrtArticleParser.php @@ -0,0 +1,23 @@ +unique() + ->map(fn ($path) => 'https://www.vrt.be' . $path) + ->toArray(); + + return $urls; + } +} \ No newline at end of file diff --git a/app/Services/Parsers/VrtHomepageParserAdapter.php b/app/Services/Parsers/VrtHomepageParserAdapter.php new file mode 100644 index 0000000..3d1db3a --- /dev/null +++ b/app/Services/Parsers/VrtHomepageParserAdapter.php @@ -0,0 +1,28 @@ +