Code Coverage for /var/www/html/backend/app/Services/Parsers/VrtArticlePageParser.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	0.00% covered (danger)	0.00%	0 / 35	0.00% covered (danger)	0.00%	0 / 5	CRAP	0.00% covered (danger)	0.00%	0 / 1
VrtArticlePageParser	0.00% covered (danger)	0.00%	0 / 35	0.00% covered (danger)	0.00%	0 / 5	210	0.00% covered (danger)	0.00%	0 / 1
extractTitle	0.00% covered (danger)	0.00%	0 / 7	0.00% covered (danger)	0.00%	0 / 1	20
extractDescription	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	12
extractFullArticle	0.00% covered (danger)	0.00%	0 / 12	0.00% covered (danger)	0.00%	0 / 1	12
extractThumbnail	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	12
extractData	0.00% covered (danger)	0.00%	0 / 6	0.00% covered (danger)	0.00%	0 / 1	2

1	<?php
2
3	namespace App\Services\Parsers;
4
/**
 * Regex-based extractor for article pages.
 *
 * Pulls the title, description, body text and thumbnail URL out of a raw HTML
 * string without building a DOM. Every extractor is a stateless static method
 * and returns null when nothing usable is found.
 */
class VrtArticlePageParser
{
    /**
     * Returns the article title.
     *
     * Sources are tried in order: the `og:title` meta tag, the first `<h1>`,
     * then the `<title>` element. Nested markup inside `<h1>`/`<title>` is
     * stripped and surrounding whitespace is trimmed.
     *
     * @param string $html Raw page markup.
     *
     * @return string|null Entity-decoded title, or null when none is found.
     */
    public static function extractTitle(string $html): ?string
    {
        $metaTitle = self::extractMetaContent($html, 'og:title');
        if ($metaTitle !== null) {
            return self::decodeEntities($metaTitle);
        }

        foreach (['h1', 'title'] as $tag) {
            // (.*?) instead of ([^<]+) so headings wrapping their text in
            // inline tags (<h1><span>…</span></h1>) are still recognised.
            $pattern = '/<' . $tag . '(?:\s[^>]*)?>(.*?)<\/' . $tag . '\s*>/is';
            if (preg_match($pattern, $html, $matches) === 1) {
                $text = self::cleanText($matches[1]);
                if ($text !== '') {
                    return $text;
                }
            }
        }

        return null;
    }

    /**
     * Returns a short description of the article.
     *
     * Uses the `og:description` meta tag when present, otherwise the text of
     * the first non-empty `<p>` element.
     *
     * @param string $html Raw page markup.
     *
     * @return string|null Entity-decoded description, or null when none is found.
     */
    public static function extractDescription(string $html): ?string
    {
        $metaDescription = self::extractMetaContent($html, 'og:description');
        if ($metaDescription !== null) {
            return self::decodeEntities($metaDescription);
        }

        $paragraphs = self::extractParagraphs($html);

        return $paragraphs[0] ?? null;
    }

    /**
     * Returns the article body as plain text.
     *
     * The text of every non-empty `<p>` element (outside `<script>`/`<style>`
     * blocks) is collected and joined with a blank line between paragraphs.
     *
     * @param string $html Raw page markup.
     *
     * @return string|null Joined paragraph text, or null when no paragraph has text.
     */
    public static function extractFullArticle(string $html): ?string
    {
        $paragraphs = self::extractParagraphs($html);

        // Explicit emptiness check: a truthiness test would discard the text "0".
        return $paragraphs === [] ? null : implode("\n\n", $paragraphs);
    }

    /**
     * Returns the URL of the article thumbnail.
     *
     * Uses the `og:image` meta tag when present, otherwise the `src` attribute
     * of the first `<img>` that has one. Character references in the attribute
     * value (e.g. `&amp;`) are decoded so the result is the actual URL.
     *
     * @param string $html Raw page markup.
     *
     * @return string|null Image URL, or null when none is found.
     */
    public static function extractThumbnail(string $html): ?string
    {
        $metaImage = self::extractMetaContent($html, 'og:image');
        if ($metaImage !== null) {
            return self::decodeEntities($metaImage);
        }

        // Lazy [^>]*? plus a mandatory whitespace before "src" keeps the match
        // on the real attribute instead of look-alikes such as data-src.
        $imgPattern = '/<img\b[^>]*?\ssrc\s*=\s*(?|"([^"]+)"|\'([^\']+)\')/i';
        if (preg_match($imgPattern, $html, $matches) === 1) {
            return self::decodeEntities($matches[1]);
        }

        return null;
    }

    /**
     * Runs every extractor against the same markup.
     *
     * @param string $html Raw page markup.
     *
     * @return array<string, string|null> Keys: title, description, full_article, thumbnail.
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
            'thumbnail' => self::extractThumbnail($html),
        ];
    }

    /**
     * Finds the raw (still entity-encoded) `content` value of a `<meta property="…">` tag.
     *
     * Accepts either attribute order, single or double quotes, and extra
     * attributes or whitespace inside the tag.
     *
     * @return string|null Non-empty attribute value, or null when absent or empty.
     */
    private static function extractMetaContent(string $html, string $property): ?string
    {
        $name = preg_quote($property, '/');
        $propertyAttr = '\sproperty\s*=\s*(?:"' . $name . '"|\'' . $name . '\')';
        // Branch reset (?|…) puts the value in group 1 whichever quote style is used.
        $contentAttr = '\scontent\s*=\s*(?|"([^"]*)"|\'([^\']*)\')';

        $patterns = [
            '/<meta\b[^>]*?' . $propertyAttr . '[^>]*?' . $contentAttr . '/i',
            '/<meta\b[^>]*?' . $contentAttr . '[^>]*?' . $propertyAttr . '/i',
        ];

        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $html, $matches) === 1 && $matches[1] !== '') {
                return $matches[1];
            }
        }

        return null;
    }

    /**
     * Collects the plain text of every non-empty `<p>` element, in document order.
     *
     * @return string[] Sequentially indexed paragraph texts; empty when there are none.
     */
    private static function extractParagraphs(string $html): array
    {
        // Drop script/style blocks first so markup inside them is not mistaken
        // for content. preg_replace() yields null on a PCRE failure; in that
        // case keep working with the untouched markup instead of passing null on.
        $content = preg_replace('/<(script|style)\b[^>]*>.*?<\/\1\s*>/is', '', $html) ?? $html;

        // "<p" must be followed by whitespace or ">" so <pre>, <picture>, … are ignored.
        if (!preg_match_all('/<p(?:\s[^>]*)?>(.*?)<\/p\s*>/is', $content, $matches)) {
            return [];
        }

        $paragraphs = [];
        foreach ($matches[1] as $fragment) {
            $text = self::cleanText($fragment);
            if ($text !== '') {
                $paragraphs[] = $text;
            }
        }

        return $paragraphs;
    }

    /**
     * Converts an HTML fragment to trimmed plain text.
     *
     * Tags are stripped before entities are decoded, so an encoded "&lt;b&gt;"
     * stays visible as text instead of being removed as markup.
     */
    private static function cleanText(string $fragment): string
    {
        $text = self::decodeEntities(strip_tags($fragment));

        // Also trim U+00A0 (decoded &nbsp;), which trim() does not treat as whitespace.
        // The /u pattern returns null on invalid UTF-8; fall back to a plain trim then.
        $trimmed = preg_replace('/^[\s\x{00A0}]+|[\s\x{00A0}]+$/u', '', $text);

        return $trimmed ?? trim($text);
    }

    /**
     * Decodes HTML character references using the HTML5 entity table,
     * which (unlike the HTML 4.01 default) includes `&apos;`.
     */
    private static function decodeEntities(string $value): string
    {
        return html_entity_decode($value, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }
}