Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 35
0.00% covered (danger)
0.00%
0 / 5
CRAP
0.00% covered (danger)
0.00%
0 / 1
VrtArticlePageParser
0.00% covered (danger)
0.00%
0 / 35
0.00% covered (danger)
0.00%
0 / 5
210
0.00% covered (danger)
0.00%
0 / 1
 extractTitle
0.00% covered (danger)
0.00%
0 / 7
0.00% covered (danger)
0.00%
0 / 1
20
 extractDescription
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
12
 extractFullArticle
0.00% covered (danger)
0.00%
0 / 12
0.00% covered (danger)
0.00%
0 / 1
12
 extractThumbnail
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
12
 extractData
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace App\Services\Parsers;
4
/**
 * Regex-based extractor for the title, description, body text and thumbnail
 * of an article page supplied as a raw HTML string.
 *
 * All methods are static and return null when nothing suitable is found.
 */
class VrtArticlePageParser
{
    /**
     * Extracts the article title.
     *
     * Sources are tried in order: the og:title meta tag, the first <h1>
     * element (nested markup is stripped and the text trimmed), then the
     * <title> element.
     *
     * @param string $html Raw HTML of the article page.
     *
     * @return string|null Entity-decoded title, or null if no source matched.
     */
    public static function extractTitle(string $html): ?string
    {
        $metaTitle = self::extractMetaProperty($html, 'og:title');
        if ($metaTitle !== null) {
            return $metaTitle;
        }

        // Capture the whole element body so headings that wrap their text in
        // inline tags (<span>, <em>, ...) are still found; strip_tags() then
        // reduces the capture to plain text.
        if (preg_match('/<h1\b[^>]*>(.*?)<\/h1>/is', $html, $matches)) {
            $heading = trim(html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8'));
            // A heading holding only markup/whitespace is not a usable title;
            // fall through to the <title> element instead.
            if ($heading !== '') {
                return $heading;
            }
        }

        if (preg_match('/<title>([^<]+)<\/title>/i', $html, $matches)) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Extracts a short description of the article.
     *
     * Uses the og:description meta tag when present, otherwise the text of
     * the first <p> element matched by the fallback pattern.
     *
     * @param string $html Raw HTML of the article page.
     *
     * @return string|null Entity-decoded description, or null if none found.
     */
    public static function extractDescription(string $html): ?string
    {
        $metaDescription = self::extractMetaProperty($html, 'og:description');
        if ($metaDescription !== null) {
            return $metaDescription;
        }

        // "<p" must be followed by whitespace or ">" so that <pre>, <picture>,
        // <path>, ... are not mistaken for a paragraph opening tag.
        $paragraphPattern = '/<p(?:\s[^>]*)?>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i';
        if (preg_match($paragraphPattern, $html, $matches)) {
            return html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Extracts the article body as plain text.
     *
     * <script> and <style> elements are removed first, then the text of every
     * <p> element is collected. Paragraphs that are empty after trimming are
     * dropped and the rest are joined with a blank line between them.
     *
     * @param string $html Raw HTML of the article page.
     *
     * @return string|null Paragraph text joined by "\n\n", or null if the page
     *                     has no non-empty paragraph.
     */
    public static function extractFullArticle(string $html): ?string
    {
        // preg_replace() returns null on a PCRE failure (e.g. backtrack limit);
        // keep the previous markup in that case instead of losing the article.
        $cleanHtml = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi', '', $html) ?? $html;
        $cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi', '', $cleanHtml) ?? $cleanHtml;

        // Same tag-name guard as in extractDescription(): only real <p> tags.
        if (!preg_match_all('/<p(?:\s[^>]*)?>(.*?)<\/p>/is', $cleanHtml, $matches)) {
            return null;
        }

        $paragraphs = array_map(static function (string $paragraph): string {
            return html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8');
        }, $matches[1]);

        $nonEmpty = array_filter($paragraphs, static function (string $paragraph): bool {
            return trim($paragraph) !== '';
        });

        $fullText = implode("\n\n", $nonEmpty);

        // Explicit comparison: a truthiness check would discard the text "0".
        return $fullText !== '' ? $fullText : null;
    }

    /**
     * Extracts a thumbnail URL.
     *
     * Uses the og:image meta tag when present, otherwise the src attribute of
     * the first <img> tag. HTML entities in the attribute value are decoded so
     * that e.g. "&amp;" in a query string becomes "&".
     *
     * @param string $html Raw HTML of the article page.
     *
     * @return string|null The image URL as written in the page, or null.
     */
    public static function extractThumbnail(string $html): ?string
    {
        $metaImage = self::extractMetaProperty($html, 'og:image');
        if ($metaImage !== null) {
            return $metaImage;
        }

        if (preg_match('/<img[^>]+src="([^"]+)"/i', $html, $matches)) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Runs every extractor on the same HTML and returns the results together.
     *
     * @param string $html Raw HTML of the article page.
     *
     * @return array<string, string|null> Keys: title, description,
     *                                    full_article, thumbnail.
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
            'thumbnail' => self::extractThumbnail($html),
        ];
    }

    /**
     * Returns the entity-decoded content of a
     * <meta property="..." content="..."> tag.
     *
     * Only the exact attribute layout shown above (property first, double
     * quotes, single space) is recognised.
     *
     * @param string $html     Raw HTML of the article page.
     * @param string $property Value of the property attribute, e.g. "og:title".
     *
     * @return string|null Decoded content value, or null if the tag is absent.
     */
    private static function extractMetaProperty(string $html, string $property): ?string
    {
        $pattern = '/<meta property="' . preg_quote($property, '/') . '" content="([^"]+)"/i';

        if (preg_match($pattern, $html, $matches) !== 1) {
            return null;
        }

        return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
    }
}