diff --git a/app/Services/Factories/ArticleParserFactory.php b/app/Services/Factories/ArticleParserFactory.php
index 0d53b9b..3496a60 100644
--- a/app/Services/Factories/ArticleParserFactory.php
+++ b/app/Services/Factories/ArticleParserFactory.php
@@ -4,12 +4,14 @@
 
 use App\Contracts\ArticleParserInterface;
 use App\Services\Parsers\VrtArticleParser;
+use App\Services\Parsers\BelgaArticleParser;
 use Exception;
 
 class ArticleParserFactory
 {
     private static array $parsers = [
         VrtArticleParser::class,
+        BelgaArticleParser::class,
     ];
 
     public static function getParser(string $url): ArticleParserInterface
diff --git a/app/Services/Factories/HomepageParserFactory.php b/app/Services/Factories/HomepageParserFactory.php
index 52f2127..f9d6ffc 100644
--- a/app/Services/Factories/HomepageParserFactory.php
+++ b/app/Services/Factories/HomepageParserFactory.php
@@ -4,12 +4,14 @@
 
 use App\Contracts\HomepageParserInterface;
 use App\Services\Parsers\VrtHomepageParserAdapter;
+use App\Services\Parsers\BelgaHomepageParserAdapter;
 use Exception;
 
 class HomepageParserFactory
 {
     private static array $parsers = [
         VrtHomepageParserAdapter::class,
+        BelgaHomepageParserAdapter::class,
     ];
 
     public static function getParser(string $url): HomepageParserInterface
diff --git a/app/Services/Parsers/BelgaArticlePageParser.php b/app/Services/Parsers/BelgaArticlePageParser.php
new file mode 100644
--- /dev/null
+++ b/app/Services/Parsers/BelgaArticlePageParser.php
@@ -0,0 +1,129 @@
+<?php
+
+namespace App\Services\Parsers;
+
+/**
+ * Regex-based extractor for Belga News Agency article pages.
+ *
+ * All methods are stateless and operate on the raw HTML of one article.
+ */
+class BelgaArticlePageParser
+{
+    /**
+     * Extract the article headline.
+     *
+     * Sources are tried in order: the Prezly heading-1 <h1>, the og:title
+     * meta tag, any <h1>, then the <title> element.
+     *
+     * @return string|null Entity-decoded headline, or null if nothing matched.
+     */
+    public static function extractTitle(string $html): ?string
+    {
+        // Try the Belga-specific heading class first
+        if (preg_match('/<h1[^>]*class="[^"]*prezly-slate-heading--heading-1[^"]*"[^>]*>([^<]+)<\/h1>/i', $html, $matches)) {
+            return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
+        }
+
+        // Try meta title
+        if (preg_match('/<meta property="og:title" content="([^"]+)"/i', $html, $matches)) {
+            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
+        }
+
+        // Try any h1 tag
+        if (preg_match('/<h1[^>]*>([^<]+)<\/h1>/i', $html, $matches)) {
+            return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
+        }
+
+        // Try title tag
+        if (preg_match('/<title>([^<]+)<\/title>/i', $html, $matches)) {
+            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
+        }
+
+        return null;
+    }
+
+    /**
+     * Extract a short description of the article.
+     *
+     * Sources are tried in order: the og:description meta tag, a paragraph
+     * whose class contains "styles_paragraph__", then the first <p> that matches.
+     *
+     * @return string|null Entity-decoded description, or null if nothing matched.
+     */
+    public static function extractDescription(string $html): ?string
+    {
+        // Try meta description first
+        if (preg_match('/<meta property="og:description" content="([^"]+)"/i', $html, $matches)) {
+            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
+        }
+
+        // Try Belga-specific paragraph class
+        if (preg_match('/<p[^>]*class="[^"]*styles_paragraph__[^"]*"[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i', $html, $matches)) {
+            return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
+        }
+
+        // Try to find first paragraph in article content
+        if (preg_match('/<p[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i', $html, $matches)) {
+            return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
+        }
+
+        return null;
+    }
+
+    /**
+     * Extract the article body as plain text.
+     *
+     * Paragraphs come from the "prezly-slate-document" <section> when present,
+     * otherwise from every <p> in the page. Tags are stripped, entities are
+     * decoded and the non-empty paragraphs are joined with blank lines.
+     *
+     * @return string|null Article text, or null when no paragraph has content.
+     */
+    public static function extractFullArticle(string $html): ?string
+    {
+        // Remove scripts, styles, and other non-content elements.
+        // preg_replace() returns null on a PCRE error (e.g. the backtrack limit on
+        // a very large page); fall back to the previous string instead of losing it.
+        $cleanHtml = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi', '', $html) ?? $html;
+        $cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi', '', $cleanHtml) ?? $cleanHtml;
+
+        // Try to extract content from Belga-specific document section
+        if (preg_match('/<section[^>]*class="[^"]*prezly-slate-document[^"]*"[^>]*>(.*?)<\/section>/is', $cleanHtml, $sectionMatches)) {
+            $sectionHtml = $sectionMatches[1];
+            preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $sectionHtml, $matches);
+        } else {
+            // Fallback: Extract all paragraph content
+            preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches);
+        }
+
+        if (!empty($matches[1])) {
+            $paragraphs = array_map(function($paragraph) {
+                return html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8');
+            }, $matches[1]);
+
+            // Filter out empty paragraphs and join with double newlines
+            $fullText = implode("\n\n", array_filter($paragraphs, function($p) {
+                return trim($p) !== '';
+            }));
+
+            // Compare against '' explicitly: `?:` would also discard the text "0".
+            return $fullText !== '' ? $fullText : null;
+        }
+
+        return null;
+    }
+
+    /**
+     * Run all extractors on one article page.
+     *
+     * @return array{title: ?string, description: ?string, full_article: ?string}
+     */
+    public static function extractData(string $html): array
+    {
+        return [
+            'title' => self::extractTitle($html),
+            'description' => self::extractDescription($html),
+            'full_article' => self::extractFullArticle($html),
+        ];
+    }
+}
diff --git a/app/Services/Parsers/BelgaArticleParser.php b/app/Services/Parsers/BelgaArticleParser.php
new file mode 100644
--- /dev/null
+++ b/app/Services/Parsers/BelgaArticleParser.php
@@ -0,0 +1,31 @@
+<?php
+
+namespace App\Services\Parsers;
+
+use App\Contracts\ArticleParserInterface;
+
+/**
+ * ArticleParserInterface implementation for Belga News Agency articles.
+ *
+ * Delegates the HTML extraction to BelgaArticlePageParser.
+ */
+class BelgaArticleParser implements ArticleParserInterface
+{
+    /** Accept any URL that contains the Belga News Agency domain. */
+    public function canParse(string $url): bool
+    {
+        return str_contains($url, 'belganewsagency.eu');
+    }
+
+    /** Extract title, description and full text from an article page. */
+    public function extractData(string $html): array
+    {
+        return BelgaArticlePageParser::extractData($html);
+    }
+
+    /** Human-readable name of this source. */
+    public function getSourceName(): string
+    {
+        return 'Belga News Agency';
+    }
+}
diff --git a/app/Services/Parsers/BelgaHomepageParser.php b/app/Services/Parsers/BelgaHomepageParser.php
new file mode 100644
--- /dev/null
+++ b/app/Services/Parsers/BelgaHomepageParser.php
@@ -0,0 +1,29 @@
+<?php
+
+namespace App\Services\Parsers;
+
+/**
+ * Finds article links on the Belga News Agency homepage.
+ */
+class BelgaHomepageParser
+{
+    /**
+     * Collect the distinct article URLs linked from the homepage HTML.
+     *
+     * Only absolute https://www.belganewsagency.eu/<slug> links whose slug is
+     * made of lowercase letters, digits and hyphens are returned.
+     *
+     * @return list<string> URLs in order of first appearance.
+     */
+    public static function extractArticleUrls(string $html): array
+    {
+        preg_match_all('/href="(https:\/\/www\.belganewsagency\.eu\/[a-z0-9-]+)"/', $html, $matches);
+
+        // unique() keeps the original keys, which leaves gaps once duplicates
+        // are dropped; values() re-indexes so callers get a plain list.
+        return collect($matches[1] ?? [])
+            ->unique()
+            ->values()
+            ->toArray();
+    }
+}
diff --git a/app/Services/Parsers/BelgaHomepageParserAdapter.php b/app/Services/Parsers/BelgaHomepageParserAdapter.php
new file mode 100644
--- /dev/null
+++ b/app/Services/Parsers/BelgaHomepageParserAdapter.php
@@ -0,0 +1,37 @@
+<?php
+
+namespace App\Services\Parsers;
+
+use App\Contracts\HomepageParserInterface;
+
+/**
+ * HomepageParserInterface implementation for the Belga News Agency homepage.
+ *
+ * Delegates link extraction to BelgaHomepageParser.
+ */
+class BelgaHomepageParserAdapter implements HomepageParserInterface
+{
+    /** Accept any URL that contains the Belga News Agency domain. */
+    public function canParse(string $url): bool
+    {
+        return str_contains($url, 'belganewsagency.eu');
+    }
+
+    /** Return the article URLs found in the homepage HTML. */
+    public function extractArticleUrls(string $html): array
+    {
+        return BelgaHomepageParser::extractArticleUrls($html);
+    }
+
+    /** URL of the homepage to fetch. */
+    public function getHomepageUrl(): string
+    {
+        return 'https://www.belganewsagency.eu/';
+    }
+
+    /** Human-readable name of this source. */
+    public function getSourceName(): string
+    {
+        return 'Belga News Agency';
+    }
+}