<?php

namespace Domains\Article\Parsers\Vrt;

/**
 * Regex-based extraction of article fields (title, description, body text,
 * thumbnail) from the raw HTML of an article page.
 *
 * All methods are static and side-effect free. Every extractor returns null
 * when the requested field cannot be found.
 */
class VrtArticlePageParser
{
    /** Flags shared by every html_entity_decode() call in this class. */
    private const ENTITY_FLAGS = ENT_QUOTES | ENT_HTML5;

    /**
     * Extract the article title.
     *
     * Sources are tried in order: the `og:title` meta tag, the first non-empty
     * `<h1>` element, then the first non-empty `<title>` element.
     *
     * @param string $html Raw HTML of the page.
     *
     * @return string|null Entity-decoded, whitespace-normalised title, or null.
     */
    public static function extractTitle(string $html): ?string
    {
        $title = self::metaText($html, 'og:title');
        if ($title !== null) {
            return $title;
        }

        // Headings are often wrapped in inline markup (<span>, <a>, ...), so
        // the whole element body is captured and tags are stripped afterwards.
        foreach (['h1', 'title'] as $tag) {
            $texts = self::elementTexts($html, $tag);
            if ($texts !== []) {
                return $texts[0];
            }
        }

        return null;
    }

    /**
     * Extract a short description of the article.
     *
     * Uses the `og:description` meta tag when present, otherwise the text of
     * the first non-empty `<p>` element outside `<script>`/`<style>` blocks.
     *
     * @param string $html Raw HTML of the page.
     *
     * @return string|null Entity-decoded, whitespace-normalised description, or null.
     */
    public static function extractDescription(string $html): ?string
    {
        $description = self::metaText($html, 'og:description');
        if ($description !== null) {
            return $description;
        }

        $paragraphs = self::elementTexts(self::stripNonContent($html), 'p');

        return $paragraphs[0] ?? null;
    }

    /**
     * Extract the article body as plain text.
     *
     * Collects the text of every non-empty `<p>` element outside
     * `<script>`/`<style>` blocks and joins them with a blank line.
     *
     * @param string $html Raw HTML of the page.
     *
     * @return string|null Paragraphs separated by "\n\n", or null when the page has no non-empty paragraph.
     */
    public static function extractFullArticle(string $html): ?string
    {
        $paragraphs = self::elementTexts(self::stripNonContent($html), 'p');

        // Explicit emptiness check: a truthiness test would turn the valid
        // article text "0" into null.
        return $paragraphs === [] ? null : implode("\n\n", $paragraphs);
    }

    /**
     * Extract the URL of the article thumbnail.
     *
     * Uses the `og:image` meta tag when present, otherwise the `src` attribute
     * of the first `<img>` element that has a non-empty one.
     *
     * @param string $html Raw HTML of the page.
     *
     * @return string|null Entity-decoded, trimmed URL, or null.
     */
    public static function extractThumbnail(string $html): ?string
    {
        $metaImage = self::extractMetaContent($html, 'og:image');
        if ($metaImage !== null) {
            $url = self::decodeUrl($metaImage);
            if ($url !== null) {
                return $url;
            }
        }

        // "\ssrc" requires whitespace before the attribute name so that
        // data-src / srcset-like attributes are not mistaken for src.
        $imgPattern = '/<img\b[^>]*?\ssrc\s*=\s*(?|"([^"]+)"|\'([^\']+)\')/i';
        if (preg_match($imgPattern, $html, $matches) === 1) {
            return self::decodeUrl($matches[1]);
        }

        return null;
    }

    /**
     * Run every extractor against the same HTML.
     *
     * @param string $html Raw HTML of the page.
     *
     * @return array<string, string|null> Keys: title, description, full_article, thumbnail.
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
            'thumbnail' => self::extractThumbnail($html),
        ];
    }

    /**
     * Return the decoded text of a `<meta property="...">` tag, or null when
     * the tag is missing or its content is blank.
     */
    private static function metaText(string $html, string $property): ?string
    {
        $raw = self::extractMetaContent($html, $property);

        return $raw === null ? null : self::decodeText($raw);
    }

    /**
     * Find the raw (still entity-encoded) `content` value of the first
     * `<meta>` tag whose `property` attribute equals $property and whose
     * `content` attribute is non-empty.
     *
     * Attribute order and quote style (single or double) do not matter; the
     * property name is compared case-insensitively.
     */
    private static function extractMetaContent(string $html, string $property): ?string
    {
        // Match whole <meta> tags while skipping over quoted attribute values,
        // so a ">" inside a value does not end the tag early. Possessive
        // quantifiers keep the match linear on malformed markup.
        $tagPattern = '/<meta\b(?:[^>"\']++|"[^"]*+"|\'[^\']*+\')*+>/i';
        if (!preg_match_all($tagPattern, $html, $tags)) {
            return null;
        }

        $propertyPattern = '/\sproperty\s*=\s*(["\'])' . preg_quote($property, '/') . '\1/i';
        // Branch reset (?|...) puts the value in group 1 for either quote style.
        $contentPattern = '/\scontent\s*=\s*(?|"([^"]+)"|\'([^\']+)\')/i';

        foreach ($tags[0] as $tag) {
            if (preg_match($propertyPattern, $tag) === 1
                && preg_match($contentPattern, $tag, $matches) === 1
            ) {
                return $matches[1];
            }
        }

        return null;
    }

    /**
     * Remove `<script>` and `<style>` elements so their contents are not
     * mistaken for article markup.
     */
    private static function stripNonContent(string $html): string
    {
        $stripped = preg_replace(
            [
                '/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/i',
                '/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/i',
            ],
            '',
            $html
        );

        // preg_replace() returns null on a PCRE failure (e.g. backtrack limit
        // on a very large page); fall back to the untouched markup.
        return $stripped ?? $html;
    }

    /**
     * Return the plain text of every `<$tag>` element, in document order,
     * omitting elements whose text is blank.
     *
     * @return string[] Zero-indexed list of non-empty texts.
     */
    private static function elementTexts(string $html, string $tag): array
    {
        $name = preg_quote($tag, '/');
        // "(?:\s[^>]*)?" instead of "[^>]*" so that e.g. <pre> or <path> is
        // not treated as a <p> element.
        $pattern = '/<' . $name . '(?:\s[^>]*)?>(.*?)<\/' . $name . '\s*>/is';
        if (!preg_match_all($pattern, $html, $matches)) {
            return [];
        }

        $texts = [];
        foreach ($matches[1] as $fragment) {
            $text = self::decodeText(strip_tags($fragment));
            if ($text !== null) {
                $texts[] = $text;
            }
        }

        return $texts;
    }

    /**
     * Decode HTML entities, collapse whitespace runs (including non-breaking
     * spaces) to a single space and trim. Returns null for blank text.
     */
    private static function decodeText(string $raw): ?string
    {
        $decoded = html_entity_decode($raw, self::ENTITY_FLAGS, 'UTF-8');

        // With the "u" modifier preg_replace() returns null on invalid UTF-8;
        // keep the un-normalised text in that case rather than losing it.
        $normalised = preg_replace('/[\s\x{00A0}]+/u', ' ', $decoded) ?? $decoded;
        $normalised = trim($normalised);

        return $normalised === '' ? null : $normalised;
    }

    /**
     * Decode HTML entities in an attribute value holding a URL (e.g. "&amp;"
     * in a query string) and trim it. Returns null for a blank value.
     */
    private static function decodeUrl(string $raw): ?string
    {
        $url = trim(html_entity_decode($raw, self::ENTITY_FLAGS, 'UTF-8'));

        return $url === '' ? null : $url;
    }
}