Add multiple sources

2025-06-29 21:39:28 +02:00 · 2025-06-29 21:39:28 +02:00 · 83194cd64b
commit 83194cd64b
parent 788d649276
6 changed files with 164 additions and 0 deletions
--- a/app/Services/Factories/ArticleParserFactory.php
+++ b/app/Services/Factories/ArticleParserFactory.php
@ -4,12 +4,14 @@

 use App\Contracts\ArticleParserInterface;
 use App\Services\Parsers\VrtArticleParser;
+use App\Services\Parsers\BelgaArticleParser;
 use Exception;

 class ArticleParserFactory
 {
    private static array $parsers = [
        VrtArticleParser::class,
+        BelgaArticleParser::class,
    ];

    public static function getParser(string $url): ArticleParserInterface
--- a/app/Services/Factories/HomepageParserFactory.php
+++ b/app/Services/Factories/HomepageParserFactory.php
@ -4,12 +4,14 @@

 use App\Contracts\HomepageParserInterface;
 use App\Services\Parsers\VrtHomepageParserAdapter;
+use App\Services\Parsers\BelgaHomepageParserAdapter;
 use Exception;

 class HomepageParserFactory
 {
    private static array $parsers = [
        VrtHomepageParserAdapter::class,
+        BelgaHomepageParserAdapter::class,
    ];

    public static function getParser(string $url): HomepageParserInterface
--- a/app/Services/Parsers/BelgaArticlePageParser.php
+++ b/app/Services/Parsers/BelgaArticlePageParser.php
@ -0,0 +1,91 @@
+<?php
+
+namespace App\Services\Parsers;
+
+/**
+ * Regex-based extractor for Belga News Agency article pages.
+ *
+ * All methods are static and operate on the raw HTML of a single article
+ * page. Each extractor returns null when nothing usable is found.
+ */
+class BelgaArticlePageParser
+{
+    /**
+     * Extracts the article headline.
+     *
+     * Sources are tried in order: the h1 carrying the
+     * "prezly-slate-heading--heading-1" class, the og:title meta tag, any h1,
+     * and finally the <title> element. A source that yields only whitespace
+     * or markup is skipped so the next one can be tried.
+     *
+     * @param string $html Raw HTML of the article page.
+     * @return string|null Decoded, trimmed headline, or null if none was found.
+     */
+    public static function extractTitle(string $html): ?string
+    {
+        // The h1 bodies are captured lazily so inline markup inside the heading
+        // (<span>, <em>, ...) does not prevent a match; tags are stripped afterwards.
+        return self::firstText($html, '/<h1[^>]*class="[^"]*prezly-slate-heading--heading-1[^"]*"[^>]*>(.*?)<\/h1>/is', true)
+            ?? self::firstText($html, '/<meta property="og:title" content="([^"]+)"/i', false)
+            ?? self::firstText($html, '/<h1[^>]*>(.*?)<\/h1>/is', true)
+            ?? self::firstText($html, '/<title>([^<]+)<\/title>/i', false);
+    }
+
+    /**
+     * Extracts a short description of the article.
+     *
+     * Sources are tried in order: the og:description meta tag, the first
+     * paragraph whose class contains "styles_paragraph__", and finally the
+     * first paragraph found anywhere in the page.
+     *
+     * @param string $html Raw HTML of the article page.
+     * @return string|null Decoded, trimmed description, or null if none was found.
+     */
+    public static function extractDescription(string $html): ?string
+    {
+        return self::firstText($html, '/<meta property="og:description" content="([^"]+)"/i', false)
+            ?? self::firstText($html, '/<p[^>]*class="[^"]*styles_paragraph__[^"]*"[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i', true)
+            ?? self::firstText($html, '/<p[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i', true);
+    }
+
+    /**
+     * Extracts the article body as plain text.
+     *
+     * Script and style elements are removed first. Paragraphs are then taken
+     * from the section whose class contains "prezly-slate-document" when such
+     * a section exists, otherwise from the whole page. Empty paragraphs are
+     * dropped and the rest are joined with a blank line between them.
+     *
+     * @param string $html Raw HTML of the article page.
+     * @return string|null Plain-text article body, or null if no non-empty paragraph was found.
+     */
+    public static function extractFullArticle(string $html): ?string
+    {
+        // preg_replace() returns null on a PCRE failure (for example when the
+        // backtrack limit is hit on a very large page). Keep the previous
+        // string in that case instead of handing null to the next regex call.
+        $cleanHtml = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi', '', $html) ?? $html;
+        $cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi', '', $cleanHtml) ?? $cleanHtml;
+
+        // Prefer the Belga document section; otherwise scan the whole page.
+        $scope = $cleanHtml;
+        if (preg_match('/<section[^>]*class="[^"]*prezly-slate-document[^"]*"[^>]*>(.*?)<\/section>/is', $cleanHtml, $section) === 1) {
+            $scope = $section[1];
+        }
+
+        // Covers both "no paragraphs" (0) and a PCRE error (false).
+        if (!preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $scope, $found)) {
+            return null;
+        }
+
+        $paragraphs = [];
+        foreach ($found[1] as $fragment) {
+            $text = self::cleanText($fragment, true);
+            if ($text !== '') {
+                $paragraphs[] = $text;
+            }
+        }
+
+        // Explicit emptiness check: a truthiness test would discard a body of "0".
+        return $paragraphs === [] ? null : implode("\n\n", $paragraphs);
+    }
+
+    /**
+     * Runs all extractors on the given page.
+     *
+     * @param string $html Raw HTML of the article page.
+     * @return array{title: ?string, description: ?string, full_article: ?string}
+     */
+    public static function extractData(string $html): array
+    {
+        return [
+            'title' => self::extractTitle($html),
+            'description' => self::extractDescription($html),
+            'full_article' => self::extractFullArticle($html),
+        ];
+    }
+
+    /**
+     * Returns the cleaned first capture group of $pattern in $html, or null
+     * when the pattern does not match or the cleaned text is empty.
+     */
+    private static function firstText(string $html, string $pattern, bool $stripTags): ?string
+    {
+        // preg_match() yields 0 for "no match" and false on error; both mean "nothing found".
+        if (preg_match($pattern, $html, $matches) !== 1) {
+            return null;
+        }
+
+        $text = self::cleanText($matches[1], $stripTags);
+
+        return $text !== '' ? $text : null;
+    }
+
+    /**
+     * Optionally strips tags, then decodes HTML entities and trims the result.
+     */
+    private static function cleanText(string $fragment, bool $stripTags): string
+    {
+        if ($stripTags) {
+            $fragment = strip_tags($fragment);
+        }
+
+        // ENT_HTML5 so that &apos; and other HTML5 named entities are decoded as well.
+        return trim(html_entity_decode($fragment, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
+    }
+}
--- a/app/Services/Parsers/BelgaArticleParser.php
+++ b/app/Services/Parsers/BelgaArticleParser.php
@ -0,0 +1,23 @@
+<?php
+
+namespace App\Services\Parsers;
+
+use App\Contracts\ArticleParserInterface;
+
+/**
+ * Article parser for pages hosted on belganewsagency.eu.
+ *
+ * Delegates the actual HTML extraction to BelgaArticlePageParser.
+ */
+class BelgaArticleParser implements ArticleParserInterface
+{
+    /** Registrable domain served by this parser (subdomains are accepted too). */
+    private const DOMAIN = 'belganewsagency.eu';
+
+    /**
+     * Tells whether the URL points at the Belga site.
+     *
+     * The decision is made on the URL's host rather than on a substring of
+     * the whole URL, so a foreign URL that merely mentions the domain in its
+     * path or query string (or uses it as a prefix of another hostname) is
+     * rejected.
+     *
+     * @param string $url Absolute URL, or a scheme-less "host/path" string.
+     * @return bool True when the host is belganewsagency.eu or one of its subdomains.
+     */
+    public function canParse(string $url): bool
+    {
+        $host = parse_url($url, PHP_URL_HOST);
+
+        if (!is_string($host)) {
+            // Scheme-less input ("www.belganewsagency.eu/slug") has no host
+            // component; retry it as a network-path reference.
+            $host = parse_url('//' . ltrim($url, '/'), PHP_URL_HOST);
+        }
+
+        if (!is_string($host)) {
+            return false;
+        }
+
+        $host = strtolower($host);
+
+        return $host === self::DOMAIN || str_ends_with($host, '.' . self::DOMAIN);
+    }
+
+    /**
+     * Extracts title, description and full text from an article page.
+     *
+     * @param string $html Raw HTML of the article page.
+     * @return array Result of BelgaArticlePageParser::extractData().
+     */
+    public function extractData(string $html): array
+    {
+        return BelgaArticlePageParser::extractData($html);
+    }
+
+    /**
+     * @return string Human-readable name of the news source.
+     */
+    public function getSourceName(): string
+    {
+        return 'Belga News Agency';
+    }
+}
--- a/app/Services/Parsers/BelgaHomepageParser.php
+++ b/app/Services/Parsers/BelgaHomepageParser.php
@ -0,0 +1,18 @@
+<?php
+
+namespace App\Services\Parsers;
+
+/**
+ * Extracts article links from Belga News Agency homepage HTML.
+ */
+class BelgaHomepageParser
+{
+    /**
+     * Collects the distinct article URLs linked from the given HTML.
+     *
+     * Only double-quoted absolute links of the form
+     * https://www.belganewsagency.eu/<slug> are recognised, where <slug>
+     * consists of lowercase letters, digits and hyphens.
+     *
+     * @param string $html Raw homepage HTML.
+     * @return list<string> Unique URLs in order of first appearance.
+     */
+    public static function extractArticleUrls(string $html): array
+    {
+        // Capture the URL itself so the href="..." wrapper never needs stripping.
+        // A result of 0 (no links) or false (PCRE error) both yield an empty list.
+        if (!preg_match_all('/href="(https:\/\/www\.belganewsagency\.eu\/[a-z0-9-]+)"/', $html, $matches)) {
+            return [];
+        }
+
+        // array_unique() preserves the original keys; re-index so callers
+        // always receive a gap-free list.
+        return array_values(array_unique($matches[1]));
+    }
+}
--- a/app/Services/Parsers/BelgaHomepageParserAdapter.php
+++ b/app/Services/Parsers/BelgaHomepageParserAdapter.php
@ -0,0 +1,28 @@
+<?php
+
+namespace App\Services\Parsers;
+
+use App\Contracts\HomepageParserInterface;
+
+/**
+ * Homepage parser for belganewsagency.eu.
+ *
+ * Adapts the static BelgaHomepageParser to HomepageParserInterface.
+ */
+class BelgaHomepageParserAdapter implements HomepageParserInterface
+{
+    /** Registrable domain served by this parser (subdomains are accepted too). */
+    private const DOMAIN = 'belganewsagency.eu';
+
+    /**
+     * Tells whether the URL points at the Belga site.
+     *
+     * The decision is made on the URL's host rather than on a substring of
+     * the whole URL, so a foreign URL that merely mentions the domain in its
+     * path or query string (or uses it as a prefix of another hostname) is
+     * rejected.
+     *
+     * @param string $url Absolute URL, or a scheme-less "host/path" string.
+     * @return bool True when the host is belganewsagency.eu or one of its subdomains.
+     */
+    public function canParse(string $url): bool
+    {
+        $host = parse_url($url, PHP_URL_HOST);
+
+        if (!is_string($host)) {
+            // Scheme-less input ("www.belganewsagency.eu/") has no host
+            // component; retry it as a network-path reference.
+            $host = parse_url('//' . ltrim($url, '/'), PHP_URL_HOST);
+        }
+
+        if (!is_string($host)) {
+            return false;
+        }
+
+        $host = strtolower($host);
+
+        return $host === self::DOMAIN || str_ends_with($host, '.' . self::DOMAIN);
+    }
+
+    /**
+     * Extracts the article URLs linked from the homepage HTML.
+     *
+     * @param string $html Raw homepage HTML.
+     * @return array Result of BelgaHomepageParser::extractArticleUrls().
+     */
+    public function extractArticleUrls(string $html): array
+    {
+        return BelgaHomepageParser::extractArticleUrls($html);
+    }
+
+    /**
+     * @return string URL of the homepage to fetch.
+     */
+    public function getHomepageUrl(): string
+    {
+        return 'https://www.belganewsagency.eu/';
+    }
+
+    /**
+     * @return string Human-readable name of the news source.
+     */
+    public function getSourceName(): string
+    {
+        return 'Belga News Agency';
+    }
+}