Fix article validation
This commit is contained in:
parent
3b810c0ffd
commit
5c666e62af
4 changed files with 571 additions and 9 deletions
|
|
@ -12,7 +12,7 @@ public static function validate(Article $article): Article
|
||||||
|
|
||||||
$articleData = ArticleFetcher::fetchArticleData($article);
|
$articleData = ArticleFetcher::fetchArticleData($article);
|
||||||
|
|
||||||
// Update article with fetched data (title, description, etc.)
|
// Update article with fetched metadata (title, description)
|
||||||
$updateData = [
|
$updateData = [
|
||||||
'validated_at' => now(),
|
'validated_at' => now(),
|
||||||
];
|
];
|
||||||
|
|
@ -20,12 +20,10 @@ public static function validate(Article $article): Article
|
||||||
if (!empty($articleData)) {
|
if (!empty($articleData)) {
|
||||||
$updateData['title'] = $articleData['title'] ?? null;
|
$updateData['title'] = $articleData['title'] ?? null;
|
||||||
$updateData['description'] = $articleData['description'] ?? null;
|
$updateData['description'] = $articleData['description'] ?? null;
|
||||||
$updateData['full_article'] = $articleData['full_article'] ?? null;
|
|
||||||
$updateData['thumbnail'] = $articleData['thumbnail'] ?? null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!isset($articleData['full_article']) || empty($articleData['full_article'])) {
|
if (!isset($articleData['full_article']) || empty($articleData['full_article'])) {
|
||||||
logger()->warning('Article data missing full_article key', [
|
logger()->warning('Article data missing full_article content', [
|
||||||
'article_id' => $article->id,
|
'article_id' => $article->id,
|
||||||
'url' => $article->url
|
'url' => $article->url
|
||||||
]);
|
]);
|
||||||
|
|
@ -36,6 +34,7 @@ public static function validate(Article $article): Article
|
||||||
return $article->refresh();
|
return $article->refresh();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Validate using extracted content (not stored)
|
||||||
$validationResult = self::validateByKeywords($articleData['full_article']);
|
$validationResult = self::validateByKeywords($articleData['full_article']);
|
||||||
$updateData['is_valid'] = $validationResult;
|
$updateData['is_valid'] = $validationResult;
|
||||||
|
|
||||||
|
|
@ -46,8 +45,20 @@ public static function validate(Article $article): Article
|
||||||
|
|
||||||
private static function validateByKeywords(string $full_article): bool
|
private static function validateByKeywords(string $full_article): bool
|
||||||
{
|
{
|
||||||
|
// Belgian news content keywords - broader set for Belgian news relevance
|
||||||
$keywords = [
|
$keywords = [
|
||||||
'N-VA', 'Bart De Wever', 'Frank Vandenbroucke',
|
// Political parties and leaders
|
||||||
|
'N-VA', 'Bart De Wever', 'Frank Vandenbroucke', 'Alexander De Croo',
|
||||||
|
'Vooruit', 'Open Vld', 'CD&V', 'Vlaams Belang', 'PTB', 'PVDA',
|
||||||
|
|
||||||
|
// Belgian locations and institutions
|
||||||
|
'Belgium', 'Belgian', 'Flanders', 'Flemish', 'Wallonia', 'Brussels',
|
||||||
|
'Antwerp', 'Ghent', 'Bruges', 'Leuven', 'Mechelen', 'Namur', 'Liège', 'Charleroi',
|
||||||
|
'parliament', 'government', 'minister', 'policy', 'law', 'legislation',
|
||||||
|
|
||||||
|
// Common Belgian news topics
|
||||||
|
'economy', 'economic', 'education', 'healthcare', 'transport', 'climate', 'energy',
|
||||||
|
'European', 'EU', 'migration', 'security', 'justice', 'culture', 'police'
|
||||||
];
|
];
|
||||||
|
|
||||||
foreach ($keywords as $keyword) {
|
foreach ($keywords as $keyword) {
|
||||||
|
|
|
||||||
|
|
@ -55,15 +55,41 @@ public static function extractFullArticle(string $html): ?string
|
||||||
$cleanHtml = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi', '', $html);
|
$cleanHtml = preg_replace('/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi', '', $html);
|
||||||
$cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi', '', $cleanHtml);
|
$cleanHtml = preg_replace('/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi', '', $cleanHtml);
|
||||||
|
|
||||||
// Try to extract content from Belga-specific document section
|
// Look for Belga-specific paragraph class
|
||||||
|
if (preg_match_all('/<p[^>]*class="[^"]*styles_paragraph__[^"]*"[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches)) {
|
||||||
|
$paragraphs = array_map(function($paragraph) {
|
||||||
|
return html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8');
|
||||||
|
}, $matches[1]);
|
||||||
|
|
||||||
|
// Filter out empty paragraphs and join with double newlines
|
||||||
|
$fullText = implode("\n\n", array_filter($paragraphs, function($p) {
|
||||||
|
return trim($p) !== '';
|
||||||
|
}));
|
||||||
|
|
||||||
|
return $fullText ?: null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: Try to extract from prezly-slate-document section
|
||||||
if (preg_match('/<section[^>]*class="[^"]*prezly-slate-document[^"]*"[^>]*>(.*?)<\/section>/is', $cleanHtml, $sectionMatches)) {
|
if (preg_match('/<section[^>]*class="[^"]*prezly-slate-document[^"]*"[^>]*>(.*?)<\/section>/is', $cleanHtml, $sectionMatches)) {
|
||||||
$sectionHtml = $sectionMatches[1];
|
$sectionHtml = $sectionMatches[1];
|
||||||
preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $sectionHtml, $matches);
|
preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $sectionHtml, $matches);
|
||||||
} else {
|
|
||||||
// Fallback: Extract all paragraph content
|
if (!empty($matches[1])) {
|
||||||
preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches);
|
$paragraphs = array_map(function($paragraph) {
|
||||||
|
return html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8');
|
||||||
|
}, $matches[1]);
|
||||||
|
|
||||||
|
// Filter out empty paragraphs and join with double newlines
|
||||||
|
$fullText = implode("\n\n", array_filter($paragraphs, function($p) {
|
||||||
|
return trim($p) !== '';
|
||||||
|
}));
|
||||||
|
|
||||||
|
return $fullText ?: null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Final fallback: Extract all paragraph content
|
||||||
|
preg_match_all('/<p[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches);
|
||||||
if (!empty($matches[1])) {
|
if (!empty($matches[1])) {
|
||||||
$paragraphs = array_map(function($paragraph) {
|
$paragraphs = array_map(function($paragraph) {
|
||||||
return html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8');
|
return html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8');
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,334 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Tests\Unit\Services\Parsers;
|
||||||
|
|
||||||
|
use App\Services\Parsers\BelgaArticlePageParser;
|
||||||
|
use Tests\TestCase;
|
||||||
|
|
||||||
|
/**
 * Unit tests for the static HTML extraction helpers of BelgaArticlePageParser:
 * extractTitle, extractDescription, extractFullArticle, extractThumbnail and
 * the aggregate extractData.
 *
 * Every test feeds a small inline HTML fixture to one helper and compares the
 * returned value with assertSame so that type and value are both checked.
 */
class BelgaArticlePageParserTest extends TestCase
{
    public function test_extract_title_from_og_meta_tag(): void
    {
        $html = '<html><head><meta property="og:title" content="Test Article Title"/></head><body></body></html>';

        $title = BelgaArticlePageParser::extractTitle($html);

        $this->assertSame('Test Article Title', $title);
    }

    public function test_extract_title_from_h1_tag(): void
    {
        $html = '<html><body><h1>H1 Title Test</h1></body></html>';

        $title = BelgaArticlePageParser::extractTitle($html);

        $this->assertSame('H1 Title Test', $title);
    }

    public function test_extract_title_from_title_tag(): void
    {
        $html = '<html><head><title>Page Title Test</title></head><body></body></html>';

        $title = BelgaArticlePageParser::extractTitle($html);

        $this->assertSame('Page Title Test', $title);
    }

    public function test_extract_title_with_html_entities(): void
    {
        // The attribute value must carry *encoded* entities (ampersand and
        // double-quote entities): a raw double quote would terminate the
        // content="..." attribute early, and without entities the test would
        // not exercise decoding at all.
        $html = '<html><head><meta property="og:title" content="Test &amp; Article &quot;Title&quot;"/></head></html>';

        $title = BelgaArticlePageParser::extractTitle($html);

        // The parser is expected to return the decoded text.
        $this->assertSame('Test & Article "Title"', $title);
    }

    public function test_extract_title_returns_null_when_not_found(): void
    {
        $html = '<html><body><p>No title here</p></body></html>';

        $title = BelgaArticlePageParser::extractTitle($html);

        $this->assertNull($title);
    }

    public function test_extract_description_from_og_meta_tag(): void
    {
        $html = '<html><head><meta property="og:description" content="Test article description"/></head></html>';

        $description = BelgaArticlePageParser::extractDescription($html);

        $this->assertSame('Test article description', $description);
    }

    public function test_extract_description_from_paragraph(): void
    {
        $html = '<html><body><p>This is the first paragraph description.</p></body></html>';

        $description = BelgaArticlePageParser::extractDescription($html);

        $this->assertSame('This is the first paragraph description.', $description);
    }

    public function test_extract_description_with_html_entities(): void
    {
        // Encoded ampersand / less-than / greater-than entities in the fixture;
        // the decoded form is what the parser is expected to return.
        $html = '<html><head><meta property="og:description" content="Description with &amp; entities &lt;test&gt;"/></head></html>';

        $description = BelgaArticlePageParser::extractDescription($html);

        $this->assertSame('Description with & entities <test>', $description);
    }

    public function test_extract_description_returns_null_when_not_found(): void
    {
        $html = '<html><body><div>No description here</div></body></html>';

        $description = BelgaArticlePageParser::extractDescription($html);

        $this->assertNull($description);
    }

    public function test_extract_full_article_from_belga_paragraph_class(): void
    {
        // Only paragraphs whose class contains "styles_paragraph__" belong to
        // the article body; the "other-class" paragraph must be left out.
        $html = '
            <html>
            <body>
            <p class="styles_paragraph__6o_o7">First paragraph content.</p>
            <p class="styles_paragraph__6o_o7">Second paragraph content.</p>
            <p class="other-class">This should be ignored.</p>
            </body>
            </html>
        ';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "First paragraph content.\n\nSecond paragraph content.";
        $this->assertSame($expected, $fullArticle);
    }

    public function test_extract_full_article_filters_empty_paragraphs(): void
    {
        // Whitespace-only and empty paragraphs must not produce blank entries
        // in the joined text.
        $html = '
            <html>
            <body>
            <p class="styles_paragraph__ABC123">Content paragraph.</p>
            <p class="styles_paragraph__DEF456"> </p>
            <p class="styles_paragraph__GHI789"></p>
            <p class="styles_paragraph__JKL012">Another content paragraph.</p>
            </body>
            </html>
        ';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "Content paragraph.\n\nAnother content paragraph.";
        $this->assertSame($expected, $fullArticle);
    }

    public function test_extract_full_article_handles_nested_tags(): void
    {
        // Inline markup inside a paragraph is stripped, its text is kept.
        $html = '
            <html>
            <body>
            <p class="styles_paragraph__TEST">This has <strong>bold text</strong> and <em>italic text</em>.</p>
            <p class="styles_paragraph__TEST2">This has <a href="#">a link</a> inside.</p>
            </body>
            </html>
        ';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "This has bold text and italic text.\n\nThis has a link inside.";
        $this->assertSame($expected, $fullArticle);
    }

    public function test_extract_full_article_removes_scripts_and_styles(): void
    {
        $html = '
            <html>
            <head>
            <script>console.log("test");</script>
            <style>.test { color: red; }</style>
            </head>
            <body>
            <p class="styles_paragraph__TEST">Clean content.</p>
            <script>alert("bad");</script>
            </body>
            </html>
        ';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $this->assertSame('Clean content.', $fullArticle);
        // Explicit negative checks keep the intent readable even though the
        // exact-match assertion above already implies them.
        $this->assertStringNotContainsString('console.log', $fullArticle);
        $this->assertStringNotContainsString('alert', $fullArticle);
        $this->assertStringNotContainsString('color: red', $fullArticle);
    }

    public function test_extract_full_article_fallback_to_prezly_document(): void
    {
        // No Belga paragraph class present: paragraphs inside the
        // prezly-slate-document section are used instead.
        $html = '
            <html>
            <body>
            <section class="prezly-slate-document">
            <p>Content from prezly section.</p>
            <p>More prezly content.</p>
            </section>
            </body>
            </html>
        ';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "Content from prezly section.\n\nMore prezly content.";
        $this->assertSame($expected, $fullArticle);
    }

    public function test_extract_full_article_fallback_to_all_paragraphs(): void
    {
        // Neither the Belga class nor the prezly section is present: every
        // paragraph in the document is used.
        $html = '
            <html>
            <body>
            <p>First general paragraph.</p>
            <p>Second general paragraph.</p>
            </body>
            </html>
        ';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $expected = "First general paragraph.\n\nSecond general paragraph.";
        $this->assertSame($expected, $fullArticle);
    }

    public function test_extract_full_article_returns_null_when_no_content(): void
    {
        $html = '<html><body><div>No paragraphs here</div></body></html>';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $this->assertNull($fullArticle);
    }

    public function test_extract_thumbnail_from_og_image(): void
    {
        $html = '<html><head><meta property="og:image" content="https://example.com/image.jpg"/></head></html>';

        $thumbnail = BelgaArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/image.jpg', $thumbnail);
    }

    public function test_extract_thumbnail_from_img_tag(): void
    {
        $html = '<html><body><img src="https://example.com/article-image.png" alt="test"/></body></html>';

        $thumbnail = BelgaArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/article-image.png', $thumbnail);
    }

    public function test_extract_thumbnail_prefers_og_image(): void
    {
        // Both an og:image meta tag and a body <img> exist; og:image wins.
        $html = '
            <html>
            <head><meta property="og:image" content="https://example.com/og-image.jpg"/></head>
            <body><img src="https://example.com/body-image.png" alt="test"/></body>
            </html>
        ';

        $thumbnail = BelgaArticlePageParser::extractThumbnail($html);

        $this->assertSame('https://example.com/og-image.jpg', $thumbnail);
    }

    public function test_extract_thumbnail_returns_null_when_not_found(): void
    {
        $html = '<html><body><div>No images here</div></body></html>';

        $thumbnail = BelgaArticlePageParser::extractThumbnail($html);

        $this->assertNull($thumbnail);
    }

    public function test_extract_data_returns_all_components(): void
    {
        $html = '
            <html>
            <head>
            <meta property="og:title" content="Test Article"/>
            <meta property="og:description" content="Test description"/>
            <meta property="og:image" content="https://example.com/image.jpg"/>
            </head>
            <body>
            <p class="styles_paragraph__TEST">Full article content here.</p>
            </body>
            </html>
        ';

        $data = BelgaArticlePageParser::extractData($html);

        $this->assertIsArray($data);
        $this->assertArrayHasKey('title', $data);
        $this->assertArrayHasKey('description', $data);
        $this->assertArrayHasKey('full_article', $data);
        $this->assertArrayHasKey('thumbnail', $data);

        $this->assertSame('Test Article', $data['title']);
        $this->assertSame('Test description', $data['description']);
        $this->assertSame('Full article content here.', $data['full_article']);
        $this->assertSame('https://example.com/image.jpg', $data['thumbnail']);
    }

    public function test_extract_data_handles_missing_components_gracefully(): void
    {
        $html = '<html><body><div>Minimal content</div></body></html>';

        $data = BelgaArticlePageParser::extractData($html);

        // All four keys must exist even when nothing could be extracted.
        $this->assertIsArray($data);
        $this->assertArrayHasKey('title', $data);
        $this->assertArrayHasKey('description', $data);
        $this->assertArrayHasKey('full_article', $data);
        $this->assertArrayHasKey('thumbnail', $data);

        $this->assertNull($data['title']);
        $this->assertNull($data['description']);
        $this->assertNull($data['full_article']);
        $this->assertNull($data['thumbnail']);
    }

    /**
     * Test based on actual Belga HTML structure from real article
     */
    public function test_extract_full_article_with_realistic_belga_html(): void
    {
        $html = '
            <html>
            <body>
            <div class="ContentRenderer_renderer__IBbst">
            <section class="prezly-slate-document">
            <p class="styles_paragraph__6o_o7"><strong>Around 110,000 people joined the Antwerp Pride Parade on Saturday afternoon, according to police.</strong></p>
            <p class="styles_paragraph__6o_o7">The event passed without major incidents. Earlier in the day, far-right group Voorpost held a pre-approved protest.</p>
            <p class="styles_paragraph__6o_o7">Police say they expect no problems with crowd dispersal, as departures will be staggered.</p>
            </section>
            </div>
            </body>
            </html>
        ';

        $fullArticle = BelgaArticlePageParser::extractFullArticle($html);

        $this->assertNotNull($fullArticle);
        $this->assertStringContainsString('110,000 people joined', $fullArticle);
        $this->assertStringContainsString('major incidents', $fullArticle);
        $this->assertStringContainsString('crowd dispersal', $fullArticle);

        // Should join paragraphs with double newlines
        $this->assertStringContainsString("\n\n", $fullArticle);

        // Should strip HTML tags
        $this->assertStringNotContainsString('<strong>', $fullArticle);
        $this->assertStringNotContainsString('</strong>', $fullArticle);
    }
}
|
||||||
191
backend/tests/Unit/Services/ValidationServiceKeywordTest.php
Normal file
191
backend/tests/Unit/Services/ValidationServiceKeywordTest.php
Normal file
|
|
@ -0,0 +1,191 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Tests\Unit\Services;
|
||||||
|
|
||||||
|
use App\Services\Article\ValidationService;
|
||||||
|
use Tests\TestCase;
|
||||||
|
use ReflectionClass;
|
||||||
|
use ReflectionMethod;
|
||||||
|
|
||||||
|
/**
 * Unit tests for the private static ValidationService::validateByKeywords()
 * method, which is reached through reflection.
 *
 * The sentence lists are run through two small assertion helpers so that a
 * failure names the exact sentence that did (or did not) match.
 */
class ValidationServiceKeywordTest extends TestCase
{
    /**
     * Helper method to access private validateByKeywords method
     */
    private function getValidateByKeywordsMethod(): ReflectionMethod
    {
        $reflection = new ReflectionClass(ValidationService::class);
        $method = $reflection->getMethod('validateByKeywords');
        // NOTE(review): only needed on PHP < 8.1; newer versions ignore it.
        $method->setAccessible(true);

        return $method;
    }

    /**
     * Runs validateByKeywords() statically on the given content.
     */
    private function matchesKeywords(string $content): bool
    {
        return $this->getValidateByKeywordsMethod()->invoke(null, $content);
    }

    /**
     * Asserts that every given text is accepted by the keyword validation.
     *
     * @param list<string> $contents
     */
    private function assertAllMatch(array $contents): void
    {
        foreach ($contents as $content) {
            $this->assertTrue(
                $this->matchesKeywords($content),
                sprintf('Expected a keyword match in %s', var_export($content, true))
            );
        }
    }

    /**
     * Asserts that every given text is rejected by the keyword validation.
     *
     * @param list<string> $contents
     */
    private function assertNoneMatch(array $contents): void
    {
        foreach ($contents as $content) {
            $this->assertFalse(
                $this->matchesKeywords($content),
                sprintf('Expected no keyword match in %s', var_export($content, true))
            );
        }
    }

    public function test_validates_belgian_political_keywords(): void
    {
        $this->assertAllMatch([
            'This article discusses N-VA party policies.',
            'Bart De Wever made a statement today.',
            'Frank Vandenbroucke announced new healthcare policies.',
            'Alexander De Croo addressed the nation.',
            'The Vooruit party proposed new legislation.',
            'Open Vld supports the new budget.',
            'CD&V members voted on the proposal.',
            'Vlaams Belang criticized the decision.',
            'PTB organized a protest yesterday.',
            'PVDA released a statement.',
        ]);
    }

    public function test_validates_belgian_location_keywords(): void
    {
        $this->assertAllMatch([
            'This event took place in Belgium.',
            'The Belgian government announced new policies.',
            'Flanders saw increased tourism this year.',
            'The Flemish government supports this initiative.',
            'Wallonia will receive additional funding.',
            'Brussels hosted the international conference.',
            'Antwerp Pride attracted thousands of participants.',
            'Ghent University published the research.',
            'Bruges tourism numbers increased.',
            'Leuven students organized the protest.',
            'Mechelen city council voted on the proposal.',
            'Namur hosted the cultural event.',
            'Liège airport saw increased traffic.',
            'Charleroi industrial zone expanded.',
        ]);
    }

    public function test_validates_government_keywords(): void
    {
        $this->assertAllMatch([
            'Parliament voted on the new legislation.',
            'The government announced budget cuts.',
            'The minister addressed concerns about healthcare.',
            'New policy changes will take effect next month.',
            'The law was passed with majority support.',
            'New legislation affects education funding.',
        ]);
    }

    public function test_validates_news_topic_keywords(): void
    {
        $this->assertAllMatch([
            'The economy showed signs of recovery.',
            'Economic indicators improved this quarter.',
            'Education reforms were announced today.',
            'Healthcare workers received additional support.',
            'Transport infrastructure will be upgraded.',
            'Climate change policies were discussed.',
            'Energy prices have increased significantly.',
            'European Union voted on trade agreements.',
            'EU sanctions were extended.',
            'Migration policies need urgent review.',
            'Security measures were enhanced.',
            'Justice system reforms are underway.',
            'Culture festivals received government funding.',
            'Police reported 18 administrative detentions.',
        ]);
    }

    public function test_case_insensitive_keyword_matching(): void
    {
        $this->assertAllMatch([
            'This article mentions ANTWERP in capital letters.',
            'brussels is mentioned in lowercase.',
            'BeLgIuM is mentioned in mixed case.',
            'The FLEMISH government announced policies.',
            'n-va party policies were discussed.',
            'EUROPEAN union directives apply.',
        ]);
    }

    public function test_rejects_content_without_belgian_keywords(): void
    {
        $this->assertNoneMatch([
            'This article discusses random topics.',
            'International news from other countries.',
            'Technology updates and innovations.',
            'Sports results from around the world.',
            'Entertainment news and celebrity gossip.',
            'Weather forecast for next week.',
            'Stock market analysis and trends.',
        ]);
    }

    public function test_keyword_matching_in_longer_text(): void
    {
        // Contains "Antwerp" in the middle of otherwise generic text.
        $longText = '
            This is a comprehensive article about various topics.
            It covers international relations, global economics, and regional policies.
            However, it specifically mentions that Antwerp hosted a major conference
            last week with participants from around the world. The event was
            considered highly successful and will likely be repeated next year.
        ';

        $this->assertAllMatch([$longText]);

        // Same shape of text, worded so that none of the keywords occurs.
        $longTextWithoutKeywords = '
            This is a comprehensive article about various topics.
            It covers international relations, global finance, and commercial matters.
            The conference was held in a major international city and attracted
            participants from around the world. The event was considered highly
            successful and will likely be repeated next year.
        ';

        $this->assertNoneMatch([$longTextWithoutKeywords]);
    }

    public function test_empty_content_returns_false(): void
    {
        $this->assertNoneMatch(['', ' ', "\n\n\t"]);
    }

    /**
     * Test comprehensive keyword coverage to ensure all expected keywords work
     */
    public function test_all_keywords_are_functional(): void
    {
        $expectedKeywords = [
            // Political parties and leaders
            'N-VA', 'Bart De Wever', 'Frank Vandenbroucke', 'Alexander De Croo',
            'Vooruit', 'Open Vld', 'CD&V', 'Vlaams Belang', 'PTB', 'PVDA',

            // Belgian locations and institutions
            'Belgium', 'Belgian', 'Flanders', 'Flemish', 'Wallonia', 'Brussels',
            'Antwerp', 'Ghent', 'Bruges', 'Leuven', 'Mechelen', 'Namur', 'Liège', 'Charleroi',
            'parliament', 'government', 'minister', 'policy', 'law', 'legislation',

            // Common Belgian news topics
            'economy', 'economic', 'education', 'healthcare', 'transport', 'climate', 'energy',
            'European', 'EU', 'migration', 'security', 'justice', 'culture', 'police'
        ];

        foreach ($expectedKeywords as $keyword) {
            $testContent = "This article contains the keyword: {$keyword}.";
            $result = $this->matchesKeywords($testContent);

            $this->assertTrue($result, "Keyword '{$keyword}' should match but didn't");
        }
    }

    public function test_partial_keyword_matches_work(): void
    {
        // Keywords should match when they appear as part of larger words or phrases
        $this->assertAllMatch([
            'Anti-government protesters gathered.',
            'The policeman directed traffic.',
            'Educational reforms are needed.',
            'Economic growth accelerated.',
            'The European directive was implemented.',
        ]);
    }
}
|
||||||
Loading…
Reference in a new issue