Add multiple sources
This commit is contained in:
parent
788d649276
commit
83194cd64b
6 changed files with 164 additions and 0 deletions
|
|
@ -4,12 +4,14 @@
|
||||||
|
|
||||||
use App\Contracts\ArticleParserInterface;
|
use App\Contracts\ArticleParserInterface;
|
||||||
use App\Services\Parsers\VrtArticleParser;
|
use App\Services\Parsers\VrtArticleParser;
|
||||||
|
use App\Services\Parsers\BelgaArticleParser;
|
||||||
use Exception;
|
use Exception;
|
||||||
|
|
||||||
class ArticleParserFactory
|
class ArticleParserFactory
|
||||||
{
|
{
|
||||||
private static array $parsers = [
|
private static array $parsers = [
|
||||||
VrtArticleParser::class,
|
VrtArticleParser::class,
|
||||||
|
BelgaArticleParser::class,
|
||||||
];
|
];
|
||||||
|
|
||||||
public static function getParser(string $url): ArticleParserInterface
|
public static function getParser(string $url): ArticleParserInterface
|
||||||
|
|
|
||||||
|
|
@ -4,12 +4,14 @@
|
||||||
|
|
||||||
use App\Contracts\HomepageParserInterface;
|
use App\Contracts\HomepageParserInterface;
|
||||||
use App\Services\Parsers\VrtHomepageParserAdapter;
|
use App\Services\Parsers\VrtHomepageParserAdapter;
|
||||||
|
use App\Services\Parsers\BelgaHomepageParserAdapter;
|
||||||
use Exception;
|
use Exception;
|
||||||
|
|
||||||
class HomepageParserFactory
|
class HomepageParserFactory
|
||||||
{
|
{
|
||||||
private static array $parsers = [
|
private static array $parsers = [
|
||||||
VrtHomepageParserAdapter::class,
|
VrtHomepageParserAdapter::class,
|
||||||
|
BelgaHomepageParserAdapter::class,
|
||||||
];
|
];
|
||||||
|
|
||||||
public static function getParser(string $url): HomepageParserInterface
|
public static function getParser(string $url): HomepageParserInterface
|
||||||
|
|
|
||||||
91
app/Services/Parsers/BelgaArticlePageParser.php
Normal file
91
app/Services/Parsers/BelgaArticlePageParser.php
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Services\Parsers;
|
||||||
|
|
||||||
|
/**
 * Regex-based extractor for Belga News Agency article pages.
 *
 * Every method is a pure function of the HTML string it receives. Extracted
 * text has HTML entities decoded and surrounding whitespace trimmed; text taken
 * from element content additionally has nested tags removed.
 */
class BelgaArticlePageParser
{
    /**
     * Extract the article headline.
     *
     * Sources are tried in this order and the first one that yields non-empty
     * text wins: the Prezly heading-1 `<h1>`, the `og:title` meta tag, any
     * `<h1>`, and finally the `<title>` element.
     *
     * @return string|null Trimmed plain-text title, or null when nothing matched.
     */
    public static function extractTitle(string $html): ?string
    {
        return self::firstText($html, [
            // Headings may wrap their text in inline markup, hence (.*?) + strip_tags.
            ['/<h1\b[^>]*class="[^"]*prezly-slate-heading--heading-1[^"]*"[^>]*>(.*?)<\/h1>/is', true],
            ['/<meta property="og:title" content="([^"]+)"/i', false],
            ['/<h1\b[^>]*>(.*?)<\/h1>/is', true],
            ['/<title>([^<]+)<\/title>/i', false],
        ]);
    }

    /**
     * Extract a short description of the article.
     *
     * Sources are tried in this order: the `og:description` meta tag, the
     * first non-empty `<p>` whose class contains `styles_paragraph__`, and
     * finally the first non-empty `<p>` anywhere in the document.
     *
     * @return string|null Trimmed plain-text description, or null when nothing matched.
     */
    public static function extractDescription(string $html): ?string
    {
        return self::firstText($html, [
            ['/<meta property="og:description" content="([^"]+)"/i', false],
            // \b keeps <pre>, <path>, <picture>, ... from being mistaken for <p>.
            ['/<p\b[^>]*class="[^"]*styles_paragraph__[^"]*"[^>]*>(.*?)<\/p>/is', true],
            ['/<p\b[^>]*>(.*?)<\/p>/is', true],
        ]);
    }

    /**
     * Extract the article body as plain text.
     *
     * `<script>` and `<style>` elements are removed first. Paragraphs are then
     * collected from the `prezly-slate-document` section when one is present,
     * otherwise from the whole document, and joined with blank lines.
     *
     * @return string|null Paragraphs separated by "\n\n", or null when no
     *                     non-empty paragraph was found.
     */
    public static function extractFullArticle(string $html): ?string
    {
        $markup = $html;
        foreach (['script', 'style'] as $tag) {
            $pattern = sprintf('/<%1$s\b[^<]*(?:(?!<\/%1$s>)<[^<]*)*<\/%1$s>/i', $tag);
            // preg_replace() returns null on a PCRE failure (e.g. backtrack
            // limit); keep the previous markup instead of losing the document.
            $markup = preg_replace($pattern, '', $markup) ?? $markup;
        }

        // Narrow the search to the Belga document section when it exists.
        $scope = $markup;
        $sectionPattern = '/<section[^>]*class="[^"]*prezly-slate-document[^"]*"[^>]*>(.*?)<\/section>/is';
        if (preg_match($sectionPattern, $markup, $section) === 1) {
            $scope = $section[1];
        }

        if (!preg_match_all('/<p\b[^>]*>(.*?)<\/p>/is', $scope, $found)) {
            return null;
        }

        $paragraphs = [];
        foreach ($found[1] as $fragment) {
            $text = self::toPlainText($fragment);
            if ($text !== '') {
                $paragraphs[] = $text;
            }
        }

        // Compare against the empty list rather than relying on truthiness,
        // so that an article whose whole text is "0" is not discarded.
        return $paragraphs === [] ? null : implode("\n\n", $paragraphs);
    }

    /**
     * Run all extractors over the same HTML document.
     *
     * @return array{title: ?string, description: ?string, full_article: ?string}
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
        ];
    }

    /**
     * Return the first non-empty text produced by an ordered list of patterns.
     *
     * @param list<array{0: string, 1: bool}> $strategies Pairs of [pattern, stripTags];
     *        capture group 1 of each pattern holds the candidate text, and
     *        stripTags tells whether that capture may contain nested markup.
     */
    private static function firstText(string $html, array $strategies): ?string
    {
        foreach ($strategies as [$pattern, $stripTags]) {
            // 0 (no match) and false (PCRE error) both mean "try the next source".
            if (!preg_match_all($pattern, $html, $found)) {
                continue;
            }

            foreach ($found[1] as $fragment) {
                $text = self::toPlainText($fragment, $stripTags);
                if ($text !== '') {
                    return $text;
                }
            }
        }

        return null;
    }

    /**
     * Convert an HTML fragment to trimmed plain text with entities decoded.
     */
    private static function toPlainText(string $fragment, bool $stripTags = true): string
    {
        $raw = $stripTags ? strip_tags($fragment) : $fragment;

        return trim(html_entity_decode($raw, ENT_QUOTES, 'UTF-8'));
    }
}
|
||||||
23
app/Services/Parsers/BelgaArticleParser.php
Normal file
23
app/Services/Parsers/BelgaArticleParser.php
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Services\Parsers;
|
||||||
|
|
||||||
|
use App\Contracts\ArticleParserInterface;
|
||||||
|
|
||||||
|
/**
 * Article parser for pages served from belganewsagency.eu.
 *
 * URL matching is done here; the actual HTML extraction is delegated to
 * BelgaArticlePageParser.
 */
class BelgaArticleParser implements ArticleParserInterface
{
    /** Domain whose pages (including subdomains such as www) this parser accepts. */
    private const DOMAIN = 'belganewsagency.eu';

    /**
     * Tell whether the URL points at the Belga News Agency site.
     *
     * The decision is based on the URL's host rather than a substring search,
     * so a foreign URL that merely mentions the domain in its path, query
     * string or as a host prefix is rejected.
     */
    public function canParse(string $url): bool
    {
        $host = parse_url($url, PHP_URL_HOST);

        if (!is_string($host)) {
            // Scheme-less input such as "www.belganewsagency.eu/foo" has no
            // host component; retry it as a network-path reference.
            $host = parse_url('//' . ltrim($url, '/'), PHP_URL_HOST);
        }

        if (!is_string($host)) {
            return false;
        }

        $host = strtolower($host);

        return $host === self::DOMAIN || str_ends_with($host, '.' . self::DOMAIN);
    }

    /**
     * Extract article data from a page's HTML.
     *
     * @return array Whatever BelgaArticlePageParser::extractData() returns for $html.
     */
    public function extractData(string $html): array
    {
        return BelgaArticlePageParser::extractData($html);
    }

    /**
     * Human-readable name of the news source.
     */
    public function getSourceName(): string
    {
        return 'Belga News Agency';
    }
}
|
||||||
18
app/Services/Parsers/BelgaHomepageParser.php
Normal file
18
app/Services/Parsers/BelgaHomepageParser.php
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Services\Parsers;
|
||||||
|
|
||||||
|
/**
 * Extracts article links from Belga News Agency homepage HTML.
 */
class BelgaHomepageParser
{
    /**
     * Collect the distinct article URLs linked from the given HTML.
     *
     * Only absolute links of the form
     * `https://www.belganewsagency.eu/<slug>` are recognised, where the slug
     * consists of lowercase letters, digits and hyphens.
     *
     * @return list<string> Unique URLs in order of first appearance, indexed from 0.
     */
    public static function extractArticleUrls(string $html): array
    {
        // Capture the URL itself so no quote/attribute stripping is needed afterwards.
        $pattern = '/href="(https:\/\/www\.belganewsagency\.eu\/[a-z0-9-]+)"/';

        if (!preg_match_all($pattern, $html, $matches)) {
            return [];
        }

        // array_unique() keeps the original keys; reindex so callers get a real list.
        return array_values(array_unique($matches[1]));
    }
}
|
||||||
28
app/Services/Parsers/BelgaHomepageParserAdapter.php
Normal file
28
app/Services/Parsers/BelgaHomepageParserAdapter.php
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Services\Parsers;
|
||||||
|
|
||||||
|
use App\Contracts\HomepageParserInterface;
|
||||||
|
|
||||||
|
/**
 * Homepage parser for the Belga News Agency site.
 *
 * URL matching and source metadata live here; link extraction is delegated
 * to BelgaHomepageParser.
 */
class BelgaHomepageParserAdapter implements HomepageParserInterface
{
    /** Domain whose pages (including subdomains such as www) this parser accepts. */
    private const DOMAIN = 'belganewsagency.eu';

    /**
     * Tell whether the URL points at the Belga News Agency site.
     *
     * The decision is based on the URL's host rather than a substring search,
     * so a foreign URL that merely mentions the domain in its path, query
     * string or as a host prefix is rejected.
     */
    public function canParse(string $url): bool
    {
        $host = parse_url($url, PHP_URL_HOST);

        if (!is_string($host)) {
            // Scheme-less input such as "www.belganewsagency.eu/" has no host
            // component; retry it as a network-path reference.
            $host = parse_url('//' . ltrim($url, '/'), PHP_URL_HOST);
        }

        if (!is_string($host)) {
            return false;
        }

        $host = strtolower($host);

        return $host === self::DOMAIN || str_ends_with($host, '.' . self::DOMAIN);
    }

    /**
     * Extract article URLs from homepage HTML.
     *
     * @return array Whatever BelgaHomepageParser::extractArticleUrls() returns for $html.
     */
    public function extractArticleUrls(string $html): array
    {
        return BelgaHomepageParser::extractArticleUrls($html);
    }

    /**
     * URL of the homepage to be fetched for this source.
     */
    public function getHomepageUrl(): string
    {
        return 'https://www.belganewsagency.eu/';
    }

    /**
     * Human-readable name of the news source.
     */
    public function getSourceName(): string
    {
        return 'Belga News Agency';
    }
}
|
||||||
Loading…
Reference in a new issue