Code Coverage for /var/www/html/backend/app/Services/Parsers/BelgaArticlePageParser.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	0.00% covered (danger)	0.00%	0 / 42	0.00% covered (danger)	0.00%	0 / 5	CRAP	0.00% covered (danger)	0.00%	0 / 1
BelgaArticlePageParser	0.00% covered (danger)	0.00%	0 / 42	0.00% covered (danger)	0.00%	0 / 5	306	0.00% covered (danger)	0.00%	0 / 1
extractTitle	0.00% covered (danger)	0.00%	0 / 9	0.00% covered (danger)	0.00%	0 / 1	30
extractDescription	0.00% covered (danger)	0.00%	0 / 7	0.00% covered (danger)	0.00%	0 / 1	20
extractFullArticle	0.00% covered (danger)	0.00%	0 / 15	0.00% covered (danger)	0.00%	0 / 1	20
extractThumbnail	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	12
extractData	0.00% covered (danger)	0.00%	0 / 6	0.00% covered (danger)	0.00%	0 / 1	2

1	<?php
2
3	namespace App\Services\Parsers;
4
/**
 * Regex-based extractor for article pages.
 *
 * Every method takes the raw HTML of a page and returns plain text (HTML
 * entities decoded, tags stripped where the captured fragment can contain
 * them) or null when nothing matched. Page-specific markup is recognised via
 * the `prezly-slate-*` and `styles_paragraph__*` class names; generic
 * OpenGraph / tag fallbacks are used otherwise.
 */
class BelgaArticlePageParser
{
    /**
     * Extracts the article title.
     *
     * Lookup order: <h1> carrying the `prezly-slate-heading--heading-1`
     * class, `og:title` meta tag, any <h1>, then <title>.
     *
     * @param string $html Raw page HTML.
     *
     * @return string|null Decoded title, or null when no candidate matched.
     */
    public static function extractTitle(string $html): ?string
    {
        // Try h1 with Belga-specific class first
        $belgaHeading = '/<h1[^>]*class="[^"]*prezly-slate-heading--heading-1[^"]*"[^>]*>([^<]+)<\/h1>/i';
        if (preg_match($belgaHeading, $html, $matches)) {
            return self::toPlainText($matches[1]);
        }

        // Try meta title
        if (preg_match('/<meta property="og:title" content="([^"]+)"/i', $html, $matches)) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        // Try any h1 tag
        if (preg_match('/<h1[^>]*>([^<]+)<\/h1>/i', $html, $matches)) {
            return self::toPlainText($matches[1]);
        }

        // Try title tag
        if (preg_match('/<title>([^<]+)<\/title>/i', $html, $matches)) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        return null;
    }

    /**
     * Extracts a short description of the article.
     *
     * Lookup order: `og:description` meta tag, first <p> whose class contains
     * `styles_paragraph__`, then the first <p> of the document. The paragraph
     * patterns accept leading text followed by any number of simple inline
     * elements (e.g. `Hello <em>world</em>!`).
     *
     * @param string $html Raw page HTML.
     *
     * @return string|null Decoded description, or null when no candidate matched.
     */
    public static function extractDescription(string $html): ?string
    {
        // Try meta description first
        if (preg_match('/<meta property="og:description" content="([^"]+)"/i', $html, $matches)) {
            return html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
        }

        // Leading text, then zero or more "<tag>text</tag>text" runs.
        $inlineContent = '([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)';

        // Try Belga-specific paragraph class. `\b` keeps <pre>, <path>, ... from
        // being mistaken for a <p>.
        $belgaParagraph = '/<p\b[^>]*class="[^"]*styles_paragraph__[^"]*"[^>]*>' . $inlineContent . '<\/p>/i';
        if (preg_match($belgaParagraph, $html, $matches)) {
            return self::toPlainText($matches[1]);
        }

        // Try to find first paragraph in article content
        if (preg_match('/<p\b[^>]*>' . $inlineContent . '<\/p>/i', $html, $matches)) {
            return self::toPlainText($matches[1]);
        }

        return null;
    }

    /**
     * Extracts the article body as plain text.
     *
     * <script> and <style> elements are removed first. Paragraphs are then
     * taken from the first <section> whose class contains
     * `prezly-slate-document`; when no such section exists, every <p> in the
     * page is used. Paragraphs that are empty after stripping tags are
     * dropped and the rest are joined with a blank line.
     *
     * @param string $html Raw page HTML.
     *
     * @return string|null Article text, or null when no non-empty paragraph was found.
     */
    public static function extractFullArticle(string $html): ?string
    {
        // Remove scripts and styles so their contents are never read as
        // paragraphs. preg_replace() yields null on a PCRE error; in that case
        // continue best-effort with the uncleaned markup.
        $cleanHtml = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi', '', $html) ?? $html;
        $cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi', '', $cleanHtml) ?? $cleanHtml;

        // Prefer the Belga-specific document section; otherwise scan the whole page.
        $scope = $cleanHtml;
        $belgaSection = '/<section[^>]*class="[^"]*prezly-slate-document[^"]*"[^>]*>(.*?)<\/section>/is';
        if (preg_match($belgaSection, $cleanHtml, $sectionMatches)) {
            $scope = $sectionMatches[1];
        }

        // `\b` keeps <pre>, <path>, ... from being mistaken for a <p>.
        preg_match_all('/<p\b[^>]*>(.*?)<\/p>/is', $scope, $matches);

        $paragraphs = array_map(
            static function (string $paragraph): string {
                return self::toPlainText($paragraph);
            },
            $matches[1] ?? []
        );

        // Filter out empty paragraphs and join with double newlines
        $fullText = implode("\n\n", array_filter(
            $paragraphs,
            static function (string $paragraph): bool {
                return trim($paragraph) !== '';
            }
        ));

        // Explicit comparison: a body consisting of "0" is still a body.
        return $fullText !== '' ? $fullText : null;
    }

    /**
     * Extracts a thumbnail URL.
     *
     * Lookup order: `og:image` meta tag, then the `src` of the first <img>.
     * The URL is returned exactly as it appears in the markup.
     *
     * @param string $html Raw page HTML.
     *
     * @return string|null Image URL, or null when neither source is present.
     */
    public static function extractThumbnail(string $html): ?string
    {
        // Try OpenGraph image first
        if (preg_match('/<meta property="og:image" content="([^"]+)"/i', $html, $matches)) {
            return $matches[1];
        }

        // Try first image in article content
        if (preg_match('/<img[^>]+src="([^"]+)"/i', $html, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Runs every extractor over the same HTML.
     *
     * @param string $html Raw page HTML.
     *
     * @return array<string, string|null> Keys: `title`, `description`,
     *                                    `full_article`, `thumbnail`.
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
            'thumbnail' => self::extractThumbnail($html),
        ];
    }

    /**
     * Strips tags from an HTML fragment and decodes its entities as UTF-8.
     */
    private static function toPlainText(string $fragment): string
    {
        return html_entity_decode(strip_tags($fragment), ENT_QUOTES, 'UTF-8');
    }
}