([^<]+)<\/title>/i', $html, $matches)) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Extracts a short description from an HTML document.
     *
     * The Open Graph tag `<meta property="og:description" content="...">` is tried first; it is
     * only recognised with exactly that attribute order and double-quoted values. When it is
     * absent, the text of the first simple `<p>` element is used instead.
     *
     * @param string $html Raw HTML markup.
     *
     * @return string|null Entity-decoded description, or null when neither source matches.
     */
    public static function extractDescription(string $html): ?string
    {
        // Try the Open Graph meta description first.
        if (preg_match('/<meta property="og:description" content="([^"]+)"/i', $html, $matches)) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        // Fall back to the first paragraph: leading text followed by any number of simple
        // "open tag, text, close tag, text" runs. The `\b` stops `<p` from also matching longer
        // tag names such as <pre>, <path> or <picture>.
        $firstParagraph = '/<p\b[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i';
        if (preg_match($firstParagraph, $html, $matches)) {
            return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Runs every extractor on the same HTML document and bundles the results.
     *
     * @param string $html Raw HTML markup.
     *
     * @return array Map with the keys 'title', 'description' and 'full_article', each holding the
     *               value returned by the matching extractor method.
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
        ];
    }

    /**
     * Extracts the plain text of every `<p>` element, joined by blank lines.
     *
     * `<script>` and `<style>` elements are removed first. Each paragraph then has its tags
     * stripped and its entities decoded, and whitespace-only paragraphs are dropped.
     *
     * @param string $html Raw HTML markup.
     *
     * @return string|null Paragraph texts separated by "\n\n", or null when no non-empty
     *                     paragraph is found.
     */
    public static function extractFullArticle(string $html): ?string
    {
        // preg_replace() returns null when PCRE fails (for example when one of its limits is
        // exceeded on a large document). Keep the previous markup in that case instead of
        // handing null to the next call.
        $withoutScripts = preg_replace(
            '/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi',
            '',
            $html
        ) ?? $html;
        $cleanHtml = preg_replace(
            '/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi',
            '',
            $withoutScripts
        ) ?? $withoutScripts;

        // `\b` restricts the match to real <p> elements; without it a preceding <pre> block
        // would be swallowed into the first "paragraph".
        if (!preg_match_all('/<p\b[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches)) {
            return null;
        }

        $paragraphs = array_map(
            static function (string $paragraph): string {
                return html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8');
            },
            $matches[1]
        );

        // Drop whitespace-only paragraphs; the kept ones are joined untrimmed.
        $nonEmpty = array_filter(
            $paragraphs,
            static function (string $text): bool {
                return trim($text) !== '';
            }
        );

        $fullText = implode("\n\n", $nonEmpty);

        // Compare against '' explicitly: a truthiness check would also discard the text "0".
        return $fullText !== '' ? $fullText : null;
    }
}

]*>([^<]+)<\/h1>/i', $html, $matches)) {
            return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
        }

        // Try title tag
        if (preg_match('/([^<]+)<\/title>/i', $html, $matches)) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Extracts a short description from an HTML document.
     *
     * The Open Graph tag `<meta property="og:description" content="...">` is tried first; it is
     * only recognised with exactly that attribute order and double-quoted values. When it is
     * absent, the text of the first simple `<p>` element is used instead.
     *
     * @param string $html Raw HTML markup.
     *
     * @return string|null Entity-decoded description, or null when neither source matches.
     */
    public static function extractDescription(string $html): ?string
    {
        // Try the Open Graph meta description first.
        if (preg_match('/<meta property="og:description" content="([^"]+)"/i', $html, $matches)) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        // Fall back to the first paragraph: leading text followed by any number of simple
        // "open tag, text, close tag, text" runs. The `\b` stops `<p` from also matching longer
        // tag names such as <pre>, <path> or <picture>.
        $firstParagraph = '/<p\b[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i';
        if (preg_match($firstParagraph, $html, $matches)) {
            return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Runs every extractor on the same HTML document and bundles the results.
     *
     * @param string $html Raw HTML markup.
     *
     * @return array Map with the keys 'title', 'description' and 'full_article', each holding the
     *               value returned by the matching extractor method.
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
        ];
    }

    /**
     * Extracts the plain text of every `<p>` element, joined by blank lines.
     *
     * `<script>` and `<style>` elements are removed first. Each paragraph then has its tags
     * stripped and its entities decoded, and whitespace-only paragraphs are dropped.
     *
     * @param string $html Raw HTML markup.
     *
     * @return string|null Paragraph texts separated by "\n\n", or null when no non-empty
     *                     paragraph is found.
     */
    public static function extractFullArticle(string $html): ?string
    {
        // preg_replace() returns null when PCRE fails (for example when one of its limits is
        // exceeded on a large document). Keep the previous markup in that case instead of
        // handing null to the next call.
        $withoutScripts = preg_replace(
            '/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi',
            '',
            $html
        ) ?? $html;
        $cleanHtml = preg_replace(
            '/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi',
            '',
            $withoutScripts
        ) ?? $withoutScripts;

        // `\b` restricts the match to real <p> elements; without it a preceding <pre> block
        // would be swallowed into the first "paragraph".
        if (!preg_match_all('/<p\b[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches)) {
            return null;
        }

        $paragraphs = array_map(
            static function (string $paragraph): string {
                return html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8');
            },
            $matches[1]
        );

        // Drop whitespace-only paragraphs; the kept ones are joined untrimmed.
        $nonEmpty = array_filter(
            $paragraphs,
            static function (string $text): bool {
                return trim($text) !== '';
            }
        );

        $fullText = implode("\n\n", $nonEmpty);

        // Compare against '' explicitly: a truthiness check would also discard the text "0".
        return $fullText !== '' ? $fullText : null;
    }
}