fedi-feed-router/app/Services/Parsers/GuardianArticlePageParser.php

<?php

namespace App\Services\Parsers;

/**
 * Regex-based extractor for the title, description, body text and thumbnail
 * of an article page given as a raw HTML string.
 *
 * All methods are static and side-effect free. Each returns null when
 * nothing suitable is found in the supplied markup.
 */
class GuardianArticlePageParser
{
    /**
     * Matches a real <p> element only: the tag name must be followed by
     * whitespace or the closing ">", so <picture>, <pre>, <param>, ... are
     * not mistaken for paragraphs.
     */
    private const PARAGRAPH_PATTERN = '/<p(?:\s[^>]*)?>(.*?)<\/p>/is';

    /**
     * Extract the article title.
     *
     * Sources are tried in order: the og:title meta tag, the first <h1>
     * that contains plain text only, then the <title> tag.
     *
     * @param  string  $html  Raw page markup.
     * @return string|null Entity-decoded title, or null when none is found.
     */
    public static function extractTitle(string $html): ?string
    {
        // Try meta title first
        $ogTitle = self::extractMetaContent($html, 'og:title');
        if ($ogTitle !== null) {
            return $ogTitle;
        }

        // Try any h1 tag
        if (preg_match('/<h1[^>]*>([^<]+)<\/h1>/i', $html, $matches)) {
            return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
        }

        // Try title tag
        if (preg_match('/<title>([^<]+)<\/title>/i', $html, $matches)) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Extract a short description of the article.
     *
     * Uses the og:description meta tag when present, otherwise the text of
     * the first paragraph matched by the fallback pattern (tags stripped).
     *
     * @param  string  $html  Raw page markup.
     * @return string|null Entity-decoded description, or null when none is found.
     */
    public static function extractDescription(string $html): ?string
    {
        // Try meta description first
        $ogDescription = self::extractMetaContent($html, 'og:description');
        if ($ogDescription !== null) {
            return $ogDescription;
        }

        // Try first paragraph (text optionally interleaved with simple inline elements)
        if (preg_match('/<p[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i', $html, $matches)) {
            return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Extract the article body as plain text, one paragraph per block,
     * separated by blank lines.
     *
     * <script> and <style> elements are removed first. Paragraphs are then
     * taken from the "article-body-commercial-selector" container when it
     * is present and yields text, otherwise from the whole document.
     *
     * @param  string  $html  Raw page markup.
     * @return string|null Paragraph text joined by "\n\n", or null when no
     *                     non-empty paragraph is found.
     */
    public static function extractFullArticle(string $html): ?string
    {
        // Remove scripts, styles, and other non-content elements.
        // preg_replace() returns null on a PCRE error (e.g. backtrack limit on a
        // huge inline script); in that case keep the input of that step rather
        // than feeding null into the following preg_* calls.
        $cleanHtml = $html;
        $strippers = [
            '/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi',
            '/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi',
        ];
        foreach ($strippers as $pattern) {
            $cleanHtml = preg_replace($pattern, '', $cleanHtml) ?? $cleanHtml;
        }

        // Try Guardian-specific article body container (greedy to avoid stopping at nested divs)
        if (preg_match('/<div[^>]*class="[^"]*article-body-commercial-selector[^"]*"[^>]*>(.*)<\/div>/is', $cleanHtml, $sectionMatches)) {
            preg_match_all(self::PARAGRAPH_PATTERN, $sectionMatches[1], $matches);

            if (! empty($matches[1])) {
                $sectionText = self::joinParagraphs($matches[1]);
                if ($sectionText !== null) {
                    return $sectionText;
                }
            }
        }

        // Fallback: extract all paragraph content
        preg_match_all(self::PARAGRAPH_PATTERN, $cleanHtml, $matches);
        if (! empty($matches[1])) {
            return self::joinParagraphs($matches[1]);
        }

        return null;
    }

    /**
     * Extract a thumbnail URL.
     *
     * Uses the og:image meta tag when present, otherwise the src of the
     * first <img> with a double-quoted src attribute. The attribute value is
     * entity-decoded so that "&amp;" in query strings becomes "&".
     *
     * @param  string  $html  Raw page markup.
     * @return string|null Decoded image URL, or null when none is found.
     */
    public static function extractThumbnail(string $html): ?string
    {
        // Try OpenGraph image first
        $ogImage = self::extractMetaContent($html, 'og:image');
        if ($ogImage !== null) {
            return $ogImage;
        }

        // Try first image in content
        if (preg_match('/<img[^>]+src="([^"]+)"/i', $html, $matches)) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Run every extractor and collect the results.
     *
     * @param  string  $html  Raw page markup.
     * @return array<string, string|null> Keys: title, description, full_article, thumbnail.
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
            'thumbnail' => self::extractThumbnail($html),
        ];
    }

    /**
     * Read the content attribute of a <meta property="..." content="..."> tag.
     *
     * Only the exact attribute order property-then-content with double
     * quotes is recognised.
     *
     * @param  string  $html  Raw page markup.
     * @param  string  $property  Value of the property attribute, e.g. "og:title".
     * @return string|null Entity-decoded content value, or null when the tag is absent.
     */
    private static function extractMetaContent(string $html, string $property): ?string
    {
        $pattern = '/<meta property="' . preg_quote($property, '/') . '" content="([^"]+)"/i';

        if (preg_match($pattern, $html, $matches) === 1) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Convert paragraph HTML fragments to plain text and join them with a
     * blank line, dropping fragments that are empty after tag stripping.
     *
     * @param  array<int, string>  $paragraphs  Inner HTML of each paragraph.
     * @return string|null Joined text, or null when every fragment is blank.
     */
    private static function joinParagraphs(array $paragraphs): ?string
    {
        $texts = [];
        foreach ($paragraphs as $paragraph) {
            $text = html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8');
            if (trim($text) !== '') {
                $texts[] = $text;
            }
        }

        // Explicit emptiness check: a truthiness test would discard the valid text "0".
        if ($texts === []) {
            return null;
        }

        return implode("\n\n", $texts);
    }
}