Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 42
0.00% covered (danger)
0.00%
0 / 5
CRAP
0.00% covered (danger)
0.00%
0 / 1
BelgaArticlePageParser
0.00% covered (danger)
0.00%
0 / 42
0.00% covered (danger)
0.00%
0 / 5
306
0.00% covered (danger)
0.00%
0 / 1
 extractTitle
0.00% covered (danger)
0.00%
0 / 9
0.00% covered (danger)
0.00%
0 / 1
30
 extractDescription
0.00% covered (danger)
0.00%
0 / 7
0.00% covered (danger)
0.00%
0 / 1
20
 extractFullArticle
0.00% covered (danger)
0.00%
0 / 15
0.00% covered (danger)
0.00%
0 / 1
20
 extractThumbnail
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
12
 extractData
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace App\Services\Parsers;
4
/**
 * Regex-based extractor for the title, description, body text and thumbnail
 * of a Belga article page.
 *
 * Every extractor takes the raw page HTML and returns either a trimmed,
 * entity-decoded string or null when nothing usable was found. The class is
 * stateless; all methods are static.
 */
class BelgaArticlePageParser
{
    /** Matches the inner HTML of every <p> element (and not <pre>, <picture>, ...). */
    private const PARAGRAPH_PATTERN = '/<p\b[^>]*>(.*?)<\/p>/is';

    /**
     * Extract the article title.
     *
     * Sources are tried in order: the Belga-specific h1, the og:title meta
     * tag, any h1, and finally the <title> element. Headings may contain
     * nested inline markup; it is stripped before the text is returned.
     *
     * @param string $html Raw page HTML.
     *
     * @return string|null Trimmed, entity-decoded title, or null if none found.
     */
    public static function extractTitle(string $html): ?string
    {
        return self::firstText(
            '/<h1\b[^>]*class="[^"]*prezly-slate-heading--heading-1[^"]*"[^>]*>(.*?)<\/h1>/is',
            $html
        )
            ?? self::metaContent($html, 'og:title')
            ?? self::firstText('/<h1\b[^>]*>(.*?)<\/h1>/is', $html)
            ?? self::firstText('/<title\b[^>]*>(.*?)<\/title>/is', $html);
    }

    /**
     * Extract a short description of the article.
     *
     * Sources are tried in order: the og:description meta tag, the first
     * non-empty paragraph carrying the Belga-specific paragraph class, and
     * finally the first non-empty paragraph anywhere in the page.
     *
     * @param string $html Raw page HTML.
     *
     * @return string|null Trimmed, entity-decoded description, or null.
     */
    public static function extractDescription(string $html): ?string
    {
        return self::metaContent($html, 'og:description')
            ?? self::firstText(
                '/<p\b[^>]*class="[^"]*styles_paragraph__[^"]*"[^>]*>(.*?)<\/p>/is',
                $html
            )
            ?? self::firstText(self::PARAGRAPH_PATTERN, $html);
    }

    /**
     * Extract the article body as plain text.
     *
     * Scripts and styles are removed first. If the Belga-specific document
     * section is present only its paragraphs are used; otherwise every
     * paragraph in the page is used. Empty paragraphs are dropped and the
     * rest are joined with a blank line between them.
     *
     * @param string $html Raw page HTML.
     *
     * @return string|null Paragraph text joined by "\n\n", or null if no
     *                     non-empty paragraph was found.
     */
    public static function extractFullArticle(string $html): ?string
    {
        // preg_replace() yields null on a PCRE error (e.g. backtrack limit on
        // a very large page); fall back to the unstripped input in that case.
        $cleanHtml = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi', '', $html) ?? $html;
        $cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi', '', $cleanHtml) ?? $cleanHtml;

        // NOTE: the lazy match ends at the first </section>, so a section
        // nested inside the document section would truncate the article.
        $scope = $cleanHtml;
        if (preg_match('/<section[^>]*class="[^"]*prezly-slate-document[^"]*"[^>]*>(.*?)<\/section>/is', $cleanHtml, $sectionMatches)) {
            $scope = $sectionMatches[1];
        }

        if (!preg_match_all(self::PARAGRAPH_PATTERN, $scope, $matches)) {
            return null;
        }

        $paragraphs = [];
        foreach ($matches[1] as $fragment) {
            $text = self::cleanText($fragment);
            if ($text !== '') {
                $paragraphs[] = $text;
            }
        }

        // Explicit emptiness check: a truthiness test would discard "0".
        return $paragraphs === [] ? null : implode("\n\n", $paragraphs);
    }

    /**
     * Extract the URL of the article thumbnail.
     *
     * The og:image meta tag is preferred; otherwise the src of the first
     * <img> matched in the page is used. The URL is entity-decoded because
     * attribute values are HTML-encoded (e.g. "&amp;" in query strings).
     *
     * @param string $html Raw page HTML.
     *
     * @return string|null Decoded image URL, or null if none found.
     */
    public static function extractThumbnail(string $html): ?string
    {
        $ogImage = self::metaContent($html, 'og:image');
        if ($ogImage !== null) {
            return $ogImage;
        }

        if (preg_match('/<img[^>]+src="([^"]+)"/i', $html, $matches)) {
            $src = trim(html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8'));

            return $src !== '' ? $src : null;
        }

        return null;
    }

    /**
     * Run every extractor on the same HTML.
     *
     * @param string $html Raw page HTML.
     *
     * @return array{title: string|null, description: string|null, full_article: string|null, thumbnail: string|null}
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
            'thumbnail' => self::extractThumbnail($html),
        ];
    }

    /**
     * Return the cleaned text of the first match of $pattern whose capture
     * group 1 is not empty after cleaning.
     *
     * @param string $pattern PCRE pattern with the wanted HTML in group 1.
     * @param string $html    HTML to search.
     */
    private static function firstText(string $pattern, string $html): ?string
    {
        // preg_match_all() returns 0 for no match and false on error.
        if (!preg_match_all($pattern, $html, $matches)) {
            return null;
        }

        foreach ($matches[1] as $fragment) {
            $text = self::cleanText($fragment);
            if ($text !== '') {
                return $text;
            }
        }

        return null;
    }

    /**
     * Return the decoded, trimmed content attribute of the first <meta> tag
     * whose property attribute equals $property (case-insensitive).
     *
     * Attribute order, single or double quoting, and whitespace around "="
     * do not matter. Tags with an empty content attribute are skipped.
     *
     * @param string $html     HTML to search.
     * @param string $property Value of the property attribute, e.g. "og:title".
     */
    private static function metaContent(string $html, string $property): ?string
    {
        // Quoted attribute values are consumed as units so that a ">" inside
        // a value does not end the tag early.
        if (!preg_match_all('/<meta\b(?:[^>"\']++|"[^"]*+"|\'[^\']*+\')*+>/i', $html, $tags)) {
            return null;
        }

        $propertyPattern = '/(?<![\w:-])property\s*=\s*(["\'])' . preg_quote($property, '/') . '\1/i';

        foreach ($tags[0] as $tag) {
            if (!preg_match($propertyPattern, $tag)) {
                continue;
            }

            // Branch reset (?|...) puts the value in group 1 for either quote style.
            if (preg_match('/(?<![\w:-])content\s*=\s*(?|"([^"]*)"|\'([^\']*)\')/i', $tag, $content)) {
                $value = trim(html_entity_decode($content[1], ENT_QUOTES, 'UTF-8'));
                if ($value !== '') {
                    return $value;
                }
            }
        }

        return null;
    }

    /**
     * Strip tags from an HTML fragment, decode its entities and trim it.
     *
     * Tags are stripped before decoding so that entity-encoded angle
     * brackets in the text survive as literal characters.
     */
    private static function cleanText(string $fragment): string
    {
        return trim(html_entity_decode(strip_tags($fragment), ENT_QUOTES, 'UTF-8'));
    }
}