diff --git a/app/Services/Factories/ArticleParserFactory.php b/app/Services/Factories/ArticleParserFactory.php index 0d53b9b..3496a60 100644 --- a/app/Services/Factories/ArticleParserFactory.php +++ b/app/Services/Factories/ArticleParserFactory.php @@ -4,12 +4,14 @@ use App\Contracts\ArticleParserInterface; use App\Services\Parsers\VrtArticleParser; +use App\Services\Parsers\BelgaArticleParser; use Exception; class ArticleParserFactory { private static array $parsers = [ VrtArticleParser::class, + BelgaArticleParser::class, ]; public static function getParser(string $url): ArticleParserInterface diff --git a/app/Services/Factories/HomepageParserFactory.php b/app/Services/Factories/HomepageParserFactory.php index 52f2127..f9d6ffc 100644 --- a/app/Services/Factories/HomepageParserFactory.php +++ b/app/Services/Factories/HomepageParserFactory.php @@ -4,12 +4,14 @@ use App\Contracts\HomepageParserInterface; use App\Services\Parsers\VrtHomepageParserAdapter; +use App\Services\Parsers\BelgaHomepageParserAdapter; use Exception; class HomepageParserFactory { private static array $parsers = [ VrtHomepageParserAdapter::class, + BelgaHomepageParserAdapter::class, ]; public static function getParser(string $url): HomepageParserInterface diff --git a/app/Services/Parsers/BelgaArticlePageParser.php b/app/Services/Parsers/BelgaArticlePageParser.php new file mode 100644 index 0000000..bfcb5af --- /dev/null +++ b/app/Services/Parsers/BelgaArticlePageParser.php @@ -0,0 +1,91 @@ +]*class="[^"]*prezly-slate-heading--heading-1[^"]*"[^>]*>([^<]+)<\/h1>/i', $html, $matches)) { + return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8'); + } + + // Try meta title + if (preg_match('/]*>([^<]+)<\/h1>/i', $html, $matches)) { + return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8'); + } + + // Try title tag + if (preg_match('/
]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i', $html, $matches)) { + return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8'); + } + + return null; + } + + public static function extractFullArticle(string $html): ?string + { + // Remove scripts, styles, and other non-content elements + $cleanHtml = preg_replace('/