Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 35 |
|
0.00% |
0 / 5 |
CRAP | |
0.00% |
0 / 1 |
| VrtArticlePageParser | |
0.00% |
0 / 35 |
|
0.00% |
0 / 5 |
210 | |
0.00% |
0 / 1 |
| extractTitle | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
| extractDescription | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| extractFullArticle | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
12 | |||
| extractThumbnail | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| extractData | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace App\Services\Parsers; |
| 4 | |
/**
 * Regex-based extractor for the main fields of an article page.
 *
 * Every method is static and side-effect free: it receives the raw HTML of a
 * page and returns the extracted value, or null when nothing usable is found.
 */
class VrtArticlePageParser
{
    /**
     * Extract the article title.
     *
     * Sources are tried in order: the `og:title` meta tag, the first `<h1>`
     * that contains visible text, then the `<title>` tag.
     *
     * @param string $html Raw HTML of the page.
     *
     * @return string|null Entity-decoded title, or null when no source matches.
     */
    public static function extractTitle(string $html): ?string
    {
        $metaTitle = self::extractOpenGraphContent($html, 'og:title');
        if ($metaTitle !== null) {
            return $metaTitle;
        }

        // Headlines frequently wrap their text in <span>/<a>, so capture the
        // markup and strip it instead of requiring a tag-free body.
        if (preg_match_all('/<h1\b[^>]*>(.*?)<\/h1>/is', $html, $headings) > 0) {
            foreach ($headings[1] as $heading) {
                $text = trim(html_entity_decode(strip_tags($heading), ENT_QUOTES, 'UTF-8'));
                if ($text !== '') {
                    return $text;
                }
            }
        }

        if (preg_match('/<title>([^<]+)<\/title>/i', $html, $matches) === 1) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Extract a short description of the article.
     *
     * Uses the `og:description` meta tag when present, otherwise the text of
     * the first `<p>` element that starts with plain text.
     *
     * @param string $html Raw HTML of the page.
     *
     * @return string|null Entity-decoded description, or null when none is found.
     */
    public static function extractDescription(string $html): ?string
    {
        $metaDescription = self::extractOpenGraphContent($html, 'og:description');
        if ($metaDescription !== null) {
            return $metaDescription;
        }

        // `\b` keeps the tag name exact so <pre>, <path>, <picture>, ... are
        // not mistaken for a paragraph.
        $paragraphPattern = '/<p\b[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i';
        if (preg_match($paragraphPattern, $html, $matches) === 1) {
            return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Extract the article body as plain text.
     *
     * `<script>` and `<style>` elements are removed first, then the text of
     * every non-blank `<p>` element is joined with a blank line in between.
     *
     * @param string $html Raw HTML of the page.
     *
     * @return string|null Plain-text article, or null when no paragraph has text.
     */
    public static function extractFullArticle(string $html): ?string
    {
        // preg_replace() returns null when PCRE fails (e.g. backtrack limit on
        // a very large page); in that case keep working with the unstripped
        // input instead of passing null further down.
        $withoutScripts = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/i', '', $html) ?? $html;
        $cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/i', '', $withoutScripts)
            ?? $withoutScripts;

        // `\b` prevents <pre>/<path>/<picture> from being read as paragraphs.
        $found = preg_match_all('/<p\b[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches);
        if ($found === false || $found === 0) {
            return null;
        }

        $paragraphs = [];
        foreach ($matches[1] as $rawParagraph) {
            $text = html_entity_decode(strip_tags($rawParagraph), ENT_QUOTES, 'UTF-8');
            if (trim($text) !== '') {
                $paragraphs[] = $text;
            }
        }

        // Explicit emptiness check: a truthiness test would discard the valid text "0".
        return $paragraphs === [] ? null : implode("\n\n", $paragraphs);
    }

    /**
     * Extract the URL of the article thumbnail.
     *
     * Uses the `og:image` meta tag when present, otherwise the `src` attribute
     * of the first `<img>` element that has one.
     *
     * @param string $html Raw HTML of the page.
     *
     * @return string|null Entity-decoded image URL, or null when none is found.
     */
    public static function extractThumbnail(string $html): ?string
    {
        $metaImage = self::extractOpenGraphContent($html, 'og:image');
        if ($metaImage !== null) {
            return $metaImage;
        }

        // Lazy quantifier + required whitespace before `src` so that the real
        // attribute is taken rather than `data-src` or a later look-alike.
        if (preg_match('/<img\b[^>]*?\ssrc="([^"]+)"/i', $html, $matches) === 1) {
            // Attribute values are HTML-encoded (`&amp;` in query strings).
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Run every extractor on the same HTML document.
     *
     * @param string $html Raw HTML of the page.
     *
     * @return array<string, string|null> Keys: title, description, full_article, thumbnail.
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
            'thumbnail' => self::extractThumbnail($html),
        ];
    }

    /**
     * Read the `content` attribute of a `<meta property="...">` tag.
     *
     * The `property` + `content` order is tried first; the reversed attribute
     * order is accepted as a fallback because both are valid HTML.
     *
     * @param string $html     Raw HTML of the page.
     * @param string $property Value of the `property` attribute, e.g. "og:title".
     *
     * @return string|null Entity-decoded attribute value, or null when the tag is absent.
     */
    private static function extractOpenGraphContent(string $html, string $property): ?string
    {
        $quoted = preg_quote($property, '/');
        $patterns = [
            '/<meta property="' . $quoted . '" content="([^"]+)"/i',
            '/<meta\b[^>]*?\scontent="([^"]+)"[^>]*?\sproperty="' . $quoted . '"/i',
        ];

        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $html, $matches) === 1) {
                return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
            }
        }

        return null;
    }
}