fedi-feed-router/app/Services/Parsers/BelgaArticlePageParser.php

<?php

namespace App\Services\Parsers;

/**
 * Regex-based extractor for Belga (Prezly-hosted) article pages.
 *
 * Every public method takes the raw HTML of an article page and returns the
 * requested piece of data, or null when none of its strategies finds a
 * non-empty value. All returned text has tags stripped, HTML entities decoded
 * and surrounding whitespace trimmed.
 */
class BelgaArticlePageParser
{
    /**
     * Flags passed to html_entity_decode(): decode both quote styles and use
     * the HTML5 entity table (the HTML 4.01 default leaves &apos; undecoded).
     */
    private const ENTITY_FLAGS = ENT_QUOTES | ENT_HTML5;

    /**
     * Extract the article title.
     *
     * Strategies, in order: the Belga-specific <h1> (class containing
     * "prezly-slate-heading--heading-1"), the og:title meta tag, any <h1>,
     * and finally the <title> element.
     *
     * @param string $html Raw HTML of the article page.
     * @return string|null The title, or null when no strategy yields text.
     */
    public static function extractTitle(string $html): ?string
    {
        // Headings are captured with (.*?) rather than [^<]+ so that inline
        // markup inside the heading (<span>, <em>, ...) no longer prevents a match.
        return self::firstInnerText('/<h1\b[^>]*class="[^"]*prezly-slate-heading--heading-1[^"]*"[^>]*>(.*?)<\/h1>/is', $html)
            ?? self::metaContent($html, 'og:title')
            ?? self::firstInnerText('/<h1\b[^>]*>(.*?)<\/h1>/is', $html)
            ?? self::firstInnerText('/<title\b[^>]*>(.*?)<\/title>/is', $html);
    }

    /**
     * Extract a short description of the article.
     *
     * Strategies, in order: the og:description meta tag, the first non-empty
     * Belga-specific paragraph (class containing "styles_paragraph__"), and
     * finally the first non-empty <p> anywhere in the document.
     *
     * @param string $html Raw HTML of the article page.
     * @return string|null The description, or null when nothing was found.
     */
    public static function extractDescription(string $html): ?string
    {
        // "<p\b" keeps <pre>, <picture>, <path>, ... from being treated as
        // paragraphs; (.*?) accepts paragraphs containing any inline markup.
        return self::metaContent($html, 'og:description')
            ?? self::firstInnerText('/<p\b[^>]*class="[^"]*styles_paragraph__[^"]*"[^>]*>(.*?)<\/p>/is', $html)
            ?? self::firstInnerText('/<p\b[^>]*>(.*?)<\/p>/is', $html);
    }

    /**
     * Extract the full article body as plain text.
     *
     * <script> and <style> elements are removed first. Paragraphs are then
     * taken from the Belga-specific <section> (class containing
     * "prezly-slate-document") when present, otherwise from the whole page.
     * Non-empty paragraphs are joined with a blank line between them.
     *
     * @param string $html Raw HTML of the article page.
     * @return string|null The article text, or null when no paragraph has text.
     */
    public static function extractFullArticle(string $html): ?string
    {
        // preg_replace() returns null on a PCRE error; fall back to the
        // previous string instead of passing null on to the next call.
        $cleanHtml = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi', '', $html) ?? $html;
        $cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi', '', $cleanHtml) ?? $cleanHtml;

        // Restrict the paragraph search to the Belga document section when there is one.
        $scope = $cleanHtml;
        if (preg_match('/<section[^>]*class="[^"]*prezly-slate-document[^"]*"[^>]*>(.*?)<\/section>/is', $cleanHtml, $sectionMatches)) {
            $scope = $sectionMatches[1];
        }

        if (!preg_match_all('/<p\b[^>]*>(.*?)<\/p>/is', $scope, $matches)) {
            return null;
        }

        $paragraphs = [];
        foreach ($matches[1] as $fragment) {
            $text = self::cleanText($fragment);
            if ($text !== '') {
                $paragraphs[] = $text;
            }
        }

        // Explicit emptiness check: a truthiness test would discard the text "0".
        return $paragraphs === [] ? null : implode("\n\n", $paragraphs);
    }

    /**
     * Extract the URL of the article thumbnail.
     *
     * Strategies, in order: the og:image meta tag, then the src attribute of
     * the first <img> element. The URL is entity-decoded so that "&amp;" in a
     * query string becomes "&".
     *
     * @param string $html Raw HTML of the article page.
     * @return string|null The image URL, or null when none was found.
     */
    public static function extractThumbnail(string $html): ?string
    {
        $ogImage = self::metaContent($html, 'og:image');
        if ($ogImage !== null) {
            return $ogImage;
        }

        if (preg_match('/<img\b[^>]+src="([^"]+)"/i', $html, $matches)) {
            $url = trim(html_entity_decode($matches[1], self::ENTITY_FLAGS, 'UTF-8'));

            return $url !== '' ? $url : null;
        }

        return null;
    }

    /**
     * Run every extractor on the page.
     *
     * @param string $html Raw HTML of the article page.
     * @return array{title: ?string, description: ?string, full_article: ?string, thumbnail: ?string}
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
            'thumbnail' => self::extractThumbnail($html),
        ];
    }

    /**
     * Return the decoded content attribute of the first <meta> tag whose
     * property attribute equals $property and whose content is non-empty.
     *
     * Attribute order, spacing around "=" and the quote style do not matter.
     */
    private static function metaContent(string $html, string $property): ?string
    {
        // Quoted attribute values are consumed as a unit so a ">" inside a
        // value does not end the tag early; possessive quantifiers keep the
        // scan linear.
        if (!preg_match_all('/<meta\b(?:[^>"\']++|"[^"]*"|\'[^\']*\')*+>/i', $html, $tags)) {
            return null;
        }

        $propertyPattern = '/\sproperty\s*=\s*(["\'])' . preg_quote($property, '/') . '\1/i';

        foreach ($tags[0] as $tag) {
            if (!preg_match($propertyPattern, $tag)) {
                continue;
            }

            if (preg_match('/\scontent\s*=\s*(["\'])(.*?)\1/is', $tag, $content)) {
                $value = trim(html_entity_decode($content[2], self::ENTITY_FLAGS, 'UTF-8'));
                if ($value !== '') {
                    return $value;
                }
            }
        }

        return null;
    }

    /**
     * Return the cleaned text of the first match of $pattern (capture group 1)
     * that is non-empty after cleaning, or null when there is none.
     */
    private static function firstInnerText(string $pattern, string $html): ?string
    {
        if (!preg_match_all($pattern, $html, $matches)) {
            return null;
        }

        foreach ($matches[1] as $fragment) {
            $text = self::cleanText($fragment);
            if ($text !== '') {
                return $text;
            }
        }

        return null;
    }

    /**
     * Convert an HTML fragment to trimmed plain text.
     */
    private static function cleanText(string $fragment): string
    {
        // Turn line breaks into spaces first so strip_tags() does not glue
        // the words on either side of a <br> together.
        $fragment = preg_replace('/<br\b[^>]*>/i', ' ', $fragment) ?? $fragment;

        return trim(html_entity_decode(strip_tags($fragment), self::ENTITY_FLAGS, 'UTF-8'));
    }
}
Add multiple sources 2025-06-29 21:39:28 +02:00			`<?php`

			`namespace App\Services\Parsers;`

			`class BelgaArticlePageParser`
			`{`
			`public static function extractTitle(string $html): ?string`
			`{`
			`// Try h1 with Belga-specific class first`
			`if (preg_match('/<h1[^>]*class="[^"]*prezly-slate-heading--heading-1[^"]*"[^>]*>([^<]+)<\/h1>/i', $html, $matches)) {`
			`return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');`
			`}`

			`// Try meta title`
			`if (preg_match('/<meta property="og:title" content="([^"]+)"/i', $html, $matches)) {`
			`return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');`
			`}`

			`// Try any h1 tag`
			`if (preg_match('/<h1[^>]*>([^<]+)<\/h1>/i', $html, $matches)) {`
			`return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');`
			`}`

			`// Try title tag`
			`if (preg_match('/<title>([^<]+)<\/title>/i', $html, $matches)) {`
			`return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');`
			`}`

			`return null;`
			`}`

			`public static function extractDescription(string $html): ?string`
			`{`
			`// Try meta description first`
			`if (preg_match('/<meta property="og:description" content="([^"]+)"/i', $html, $matches)) {`
			`return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');`
			`}`

			`// Try Belga-specific paragraph class`
			`if (preg_match('/<p[^>]*class="[^"]*styles_paragraph__[^"]*"[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i', $html, $matches)) {`
			`return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');`
			`}`

			`// Try to find first paragraph in article content`
			`if (preg_match('/<p[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i', $html, $matches)) {`
			`return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');`
			`}`

			`return null;`
			`}`

			`public static function extractFullArticle(string $html): ?string`
			`{`
			`// Remove scripts, styles, and other non-content elements`
			`$cleanHtml = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi', '', $html);`
			`$cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi', '', $cleanHtml);`

			`// Try to extract content from Belga-specific document section`
			`if (preg_match('/<section[^>]*class="[^"]*prezly-slate-document[^"]*"[^>]*>(.*?)<\/section>/is', $cleanHtml, $sectionMatches)) {`
			`$sectionHtml = $sectionMatches[1];`
			`preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $sectionHtml, $matches);`
			`} else {`
			`// Fallback: Extract all paragraph content`
			`preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches);`
			`}`

			`if (!empty($matches[1])) {`
			`$paragraphs = array_map(function($paragraph) {`
			`return html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8');`
			`}, $matches[1]);`

			`// Filter out empty paragraphs and join with double newlines`
			`$fullText = implode("\n\n", array_filter($paragraphs, function($p) {`
			`return trim($p) !== '';`
			`}));`

			`return $fullText ?: null;`
			`}`

			`return null;`
			`}`

Split sources, refine publisher 2025-06-30 18:18:30 +02:00			`public static function extractThumbnail(string $html): ?string`
			`{`
			`// Try OpenGraph image first`
			`if (preg_match('/<meta property="og:image" content="([^"]+)"/i', $html, $matches)) {`
			`return $matches[1];`
			`}`

			`// Try first image in article content`
			`if (preg_match('/<img[^>]+src="([^"]+)"/i', $html, $matches)) {`
			`return $matches[1];`
			`}`

			`return null;`
			`}`

Add multiple sources 2025-06-29 21:39:28 +02:00			`public static function extractData(string $html): array`
			`{`
			`return [`
			`'title' => self::extractTitle($html),`
			`'description' => self::extractDescription($html),`
			`'full_article' => self::extractFullArticle($html),`
Split sources, refine publisher 2025-06-30 18:18:30 +02:00			`'thumbnail' => self::extractThumbnail($html),`
Add multiple sources 2025-06-29 21:39:28 +02:00			`];`
			`}`
			`}`