fedi-feed-router/app/Services/Parsers/GuardianArticlePageParser.php
myrmidex 6784af2ff6
Some checks failed
CI / ci (push) Failing after 4m31s
25 - Fix all PHPStan errors and add mockery extension
2026-03-08 14:18:28 +01:00

110 lines
3.6 KiB
PHP

<?php
namespace App\Services\Parsers;
/**
 * Regex-based extractor for Guardian article pages.
 *
 * Every method is static and works on a raw HTML string. Nothing is parsed
 * into a DOM; each extractor tries a short list of regular expressions in
 * priority order and returns null when none of them match.
 */
class GuardianArticlePageParser
{
    /**
     * Extract the article title.
     *
     * Sources are tried in this order: the og:title meta tag, the first <h1>
     * whose content is plain text, then the <title> element.
     *
     * @return string|null Entity-decoded title, or null when nothing matched.
     */
    public static function extractTitle(string $html): ?string
    {
        $ogTitle = self::openGraphContent($html, 'og:title');
        if ($ogTitle !== null) {
            return $ogTitle;
        }

        // Only matches an <h1> whose content contains no nested tags.
        if (preg_match('/<h1[^>]*>([^<]+)<\/h1>/i', $html, $matches)) {
            return self::decodeEntities(strip_tags($matches[1]));
        }

        // Only matches a <title> element without attributes.
        if (preg_match('/<title>([^<]+)<\/title>/i', $html, $matches)) {
            return self::decodeEntities($matches[1]);
        }

        return null;
    }

    /**
     * Extract a short description of the article.
     *
     * Uses the og:description meta tag when present, otherwise the text of
     * the first paragraph the fallback pattern matches.
     *
     * @return string|null Entity-decoded description, or null when nothing matched.
     */
    public static function extractDescription(string $html): ?string
    {
        $ogDescription = self::openGraphContent($html, 'og:description');
        if ($ogDescription !== null) {
            return $ogDescription;
        }

        // A paragraph that starts with text and may contain simple inline
        // elements; the markup is stripped from the captured content below.
        if (preg_match('/<p[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i', $html, $matches)) {
            return self::decodeEntities(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Extract the article body as plain text, paragraphs separated by a blank line.
     *
     * <script> and <style> elements are removed first. Paragraphs are then
     * taken from the Guardian article body container when it is found and
     * contains paragraphs, otherwise from the whole document.
     *
     * @return string|null The joined paragraph text, or null when no non-blank paragraph exists.
     */
    public static function extractFullArticle(string $html): ?string
    {
        // preg_replace() returns null on a PCRE failure (for example when the
        // backtrack limit is hit on a very large page). Fall back to the
        // unstripped input rather than handing null to the matches below.
        $cleanHtml = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi', '', $html) ?? $html;
        $cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi', '', $cleanHtml) ?? $cleanHtml;

        // Guardian-specific article body container. The capture is greedy on
        // purpose so it does not stop at the first nested </div>; as a result
        // it extends to the last </div> in the document.
        if (preg_match('/<div[^>]*class="[^"]*article-body-commercial-selector[^"]*"[^>]*>(.*)<\/div>/is', $cleanHtml, $sectionMatches)) {
            preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $sectionMatches[1], $matches);
            if (! empty($matches[1])) {
                return self::joinParagraphs($matches[1]);
            }
        }

        // Fallback: every paragraph in the document.
        preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches);
        if (! empty($matches[1])) {
            return self::joinParagraphs($matches[1]);
        }

        return null;
    }

    /**
     * Extract a thumbnail URL.
     *
     * Uses the og:image meta tag when present, otherwise the src of the first
     * <img> tag. The attribute value is entity-decoded so that a URL written
     * as "...?a=1&amp;b=2" in the markup is returned as "...?a=1&b=2".
     *
     * @return string|null The image URL, or null when nothing matched.
     */
    public static function extractThumbnail(string $html): ?string
    {
        $ogImage = self::openGraphContent($html, 'og:image');
        if ($ogImage !== null) {
            return $ogImage;
        }

        if (preg_match('/<img[^>]+src="([^"]+)"/i', $html, $matches)) {
            return self::decodeEntities($matches[1]);
        }

        return null;
    }

    /**
     * Run every extractor on the given HTML.
     *
     * @return array<string, string|null> Keys: title, description, full_article, thumbnail.
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
            'thumbnail' => self::extractThumbnail($html),
        ];
    }

    /**
     * Strip tags and decode entities in each paragraph, drop paragraphs that
     * are blank after that, and join the rest with a blank line.
     *
     * @param array<int, string> $paragraphs Raw inner HTML of each paragraph.
     *
     * @return string|null The joined text, or null when every paragraph was blank.
     */
    private static function joinParagraphs(array $paragraphs): ?string
    {
        $texts = array_map(
            static fn (string $paragraph): string => self::decodeEntities(strip_tags($paragraph)),
            $paragraphs,
        );

        $fullText = implode("\n\n", array_filter(
            $texts,
            static fn (string $text): bool => trim($text) !== '',
        ));

        // Explicit comparison instead of ?: so that a body of "0" is not
        // mistaken for "no content".
        return $fullText !== '' ? $fullText : null;
    }

    /**
     * Return the entity-decoded content attribute of an OpenGraph meta tag.
     *
     * Only the exact form <meta property="..." content="..."> is recognised:
     * double-quoted attributes, with property immediately followed by content.
     *
     * @param string $property OpenGraph property name, e.g. "og:title".
     *
     * @return string|null The decoded value, or null when the tag is absent.
     */
    private static function openGraphContent(string $html, string $property): ?string
    {
        $pattern = '/<meta property="' . preg_quote($property, '/') . '" content="([^"]+)"/i';

        if (preg_match($pattern, $html, $matches)) {
            return self::decodeEntities($matches[1]);
        }

        return null;
    }

    /**
     * Decode HTML entities, including both quote styles, as UTF-8.
     */
    private static function decodeEntities(string $value): string
    {
        return html_entity_decode($value, ENT_QUOTES, 'UTF-8');
    }
}