From 5c666e62af6726a0b63d7f00055c8c4831c911d3 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sat, 9 Aug 2025 21:32:46 +0200 Subject: [PATCH] Fix article validation --- .../Services/Article/ValidationService.php | 21 +- .../Parsers/BelgaArticlePageParser.php | 34 +- .../Parsers/BelgaArticlePageParserTest.php | 334 ++++++++++++++++++ .../Services/ValidationServiceKeywordTest.php | 191 ++++++++++ 4 files changed, 571 insertions(+), 9 deletions(-) create mode 100644 backend/tests/Unit/Services/Parsers/BelgaArticlePageParserTest.php create mode 100644 backend/tests/Unit/Services/ValidationServiceKeywordTest.php diff --git a/backend/app/Services/Article/ValidationService.php b/backend/app/Services/Article/ValidationService.php index ac17e1e..819af4c 100644 --- a/backend/app/Services/Article/ValidationService.php +++ b/backend/app/Services/Article/ValidationService.php @@ -12,7 +12,7 @@ public static function validate(Article $article): Article $articleData = ArticleFetcher::fetchArticleData($article); - // Update article with fetched data (title, description, etc.) + // Update article with fetched metadata (title, description) $updateData = [ 'validated_at' => now(), ]; @@ -20,12 +20,10 @@ public static function validate(Article $article): Article if (!empty($articleData)) { $updateData['title'] = $articleData['title'] ?? null; $updateData['description'] = $articleData['description'] ?? null; - $updateData['full_article'] = $articleData['full_article'] ?? null; - $updateData['thumbnail'] = $articleData['thumbnail'] ?? null; } if (!isset($articleData['full_article']) || empty($articleData['full_article'])) { - logger()->warning('Article data missing full_article key', [ + logger()->warning('Article data missing full_article content', [ 'article_id' => $article->id, 'url' => $article->url ]); @@ -36,6 +34,7 @@ public static function validate(Article $article): Article return $article->refresh(); } + // Validate using extracted content (not stored) $validationResult = self::validateByKeywords($articleData['full_article']); $updateData['is_valid'] = $validationResult; @@ -46,8 +45,20 @@ public static function validate(Article $article): Article private static function validateByKeywords(string $full_article): bool { + // Belgian news content keywords - broader set for Belgian news relevance $keywords = [ - 'N-VA', 'Bart De Wever', 'Frank Vandenbroucke', + // Political parties and leaders + 'N-VA', 'Bart De Wever', 'Frank Vandenbroucke', 'Alexander De Croo', + 'Vooruit', 'Open Vld', 'CD&V', 'Vlaams Belang', 'PTB', 'PVDA', + + // Belgian locations and institutions + 'Belgium', 'Belgian', 'Flanders', 'Flemish', 'Wallonia', 'Brussels', + 'Antwerp', 'Ghent', 'Bruges', 'Leuven', 'Mechelen', 'Namur', 'Liège', 'Charleroi', + 'parliament', 'government', 'minister', 'policy', 'law', 'legislation', + + // Common Belgian news topics + 'economy', 'economic', 'education', 'healthcare', 'transport', 'climate', 'energy', + 'European', 'EU', 'migration', 'security', 'justice', 'culture', 'police' ]; foreach ($keywords as $keyword) { diff --git a/backend/app/Services/Parsers/BelgaArticlePageParser.php b/backend/app/Services/Parsers/BelgaArticlePageParser.php index 0a2d2dd..b438d32 100644 --- a/backend/app/Services/Parsers/BelgaArticlePageParser.php +++ b/backend/app/Services/Parsers/BelgaArticlePageParser.php @@ -55,15 +55,41 @@ public static function extractFullArticle(string $html): ?string $cleanHtml = preg_replace('/)<[^<]*)*<\/script>/mi', '', $html); $cleanHtml = preg_replace('/)<[^<]*)*<\/style>/mi', '', $cleanHtml); - // Try to extract content from Belga-specific document section + // Look for Belga-specific paragraph class + if (preg_match_all('/]*class="[^"]*styles_paragraph__[^"]*"[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches)) { + $paragraphs = array_map(function($paragraph) { + return html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8'); + }, $matches[1]); + + // Filter out empty paragraphs and join with double newlines + $fullText = implode("\n\n", array_filter($paragraphs, function($p) { + return trim($p) !== ''; + })); + + return $fullText ?: null; + } + + // Fallback: Try to extract from prezly-slate-document section if (preg_match('/]*class="[^"]*prezly-slate-document[^"]*"[^>]*>(.*?)<\/section>/is', $cleanHtml, $sectionMatches)) { $sectionHtml = $sectionMatches[1]; preg_match_all('/]*>(.*?)<\/p>/is', $sectionHtml, $matches); - } else { - // Fallback: Extract all paragraph content - preg_match_all('/]*>(.*?)<\/p>/is', $cleanHtml, $matches); + + if (!empty($matches[1])) { + $paragraphs = array_map(function($paragraph) { + return html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8'); + }, $matches[1]); + + // Filter out empty paragraphs and join with double newlines + $fullText = implode("\n\n", array_filter($paragraphs, function($p) { + return trim($p) !== ''; + })); + + return $fullText ?: null; + } } + // Final fallback: Extract all paragraph content + preg_match_all('/]*>(.*?)<\/p>/is', $cleanHtml, $matches); if (!empty($matches[1])) { $paragraphs = array_map(function($paragraph) { return html_entity_decode(strip_tags($paragraph), ENT_QUOTES, 'UTF-8'); diff --git a/backend/tests/Unit/Services/Parsers/BelgaArticlePageParserTest.php b/backend/tests/Unit/Services/Parsers/BelgaArticlePageParserTest.php new file mode 100644 index 0000000..cb99b5e --- /dev/null +++ b/backend/tests/Unit/Services/Parsers/BelgaArticlePageParserTest.php @@ -0,0 +1,334 @@ +'; + + $title = BelgaArticlePageParser::extractTitle($html); + + $this->assertEquals('Test Article Title', $title); + } + + public function test_extract_title_from_h1_tag(): void + { + $html = '

H1 Title Test

'; + + $title = BelgaArticlePageParser::extractTitle($html); + + $this->assertEquals('H1 Title Test', $title); + } + + public function test_extract_title_from_title_tag(): void + { + $html = 'Page Title Test'; + + $title = BelgaArticlePageParser::extractTitle($html); + + $this->assertEquals('Page Title Test', $title); + } + + public function test_extract_title_with_html_entities(): void + { + $html = ''; + + $title = BelgaArticlePageParser::extractTitle($html); + + $this->assertEquals('Test & Article "Title"', $title); + } + + public function test_extract_title_returns_null_when_not_found(): void + { + $html = '

No title here

'; + + $title = BelgaArticlePageParser::extractTitle($html); + + $this->assertNull($title); + } + + public function test_extract_description_from_og_meta_tag(): void + { + $html = ''; + + $description = BelgaArticlePageParser::extractDescription($html); + + $this->assertEquals('Test article description', $description); + } + + public function test_extract_description_from_paragraph(): void + { + $html = '

This is the first paragraph description.

'; + + $description = BelgaArticlePageParser::extractDescription($html); + + $this->assertEquals('This is the first paragraph description.', $description); + } + + public function test_extract_description_with_html_entities(): void + { + $html = ''; + + $description = BelgaArticlePageParser::extractDescription($html); + + $this->assertEquals('Description with & entities ', $description); + } + + public function test_extract_description_returns_null_when_not_found(): void + { + $html = '
No description here
'; + + $description = BelgaArticlePageParser::extractDescription($html); + + $this->assertNull($description); + } + + public function test_extract_full_article_from_belga_paragraph_class(): void + { + $html = ' + + +

First paragraph content.

+

Second paragraph content.

+

This should be ignored.

+ + + '; + + $fullArticle = BelgaArticlePageParser::extractFullArticle($html); + + $expected = "First paragraph content.\n\nSecond paragraph content."; + $this->assertEquals($expected, $fullArticle); + } + + public function test_extract_full_article_filters_empty_paragraphs(): void + { + $html = ' + + +

Content paragraph.

+

+

+

Another content paragraph.

+ + + '; + + $fullArticle = BelgaArticlePageParser::extractFullArticle($html); + + $expected = "Content paragraph.\n\nAnother content paragraph."; + $this->assertEquals($expected, $fullArticle); + } + + public function test_extract_full_article_handles_nested_tags(): void + { + $html = ' + + +

This has bold text and italic text.

+

This has a link inside.

+ + + '; + + $fullArticle = BelgaArticlePageParser::extractFullArticle($html); + + $expected = "This has bold text and italic text.\n\nThis has a link inside."; + $this->assertEquals($expected, $fullArticle); + } + + public function test_extract_full_article_removes_scripts_and_styles(): void + { + $html = ' + + + + + + +

Clean content.

+ + + + '; + + $fullArticle = BelgaArticlePageParser::extractFullArticle($html); + + $this->assertEquals('Clean content.', $fullArticle); + $this->assertStringNotContainsString('console.log', $fullArticle); + $this->assertStringNotContainsString('alert', $fullArticle); + $this->assertStringNotContainsString('color: red', $fullArticle); + } + + public function test_extract_full_article_fallback_to_prezly_document(): void + { + $html = ' + + +
+

Content from prezly section.

+

More prezly content.

+
+ + + '; + + $fullArticle = BelgaArticlePageParser::extractFullArticle($html); + + $expected = "Content from prezly section.\n\nMore prezly content."; + $this->assertEquals($expected, $fullArticle); + } + + public function test_extract_full_article_fallback_to_all_paragraphs(): void + { + $html = ' + + +

First general paragraph.

+

Second general paragraph.

+ + + '; + + $fullArticle = BelgaArticlePageParser::extractFullArticle($html); + + $expected = "First general paragraph.\n\nSecond general paragraph."; + $this->assertEquals($expected, $fullArticle); + } + + public function test_extract_full_article_returns_null_when_no_content(): void + { + $html = '
No paragraphs here
'; + + $fullArticle = BelgaArticlePageParser::extractFullArticle($html); + + $this->assertNull($fullArticle); + } + + public function test_extract_thumbnail_from_og_image(): void + { + $html = ''; + + $thumbnail = BelgaArticlePageParser::extractThumbnail($html); + + $this->assertEquals('https://example.com/image.jpg', $thumbnail); + } + + public function test_extract_thumbnail_from_img_tag(): void + { + $html = 'test'; + + $thumbnail = BelgaArticlePageParser::extractThumbnail($html); + + $this->assertEquals('https://example.com/article-image.png', $thumbnail); + } + + public function test_extract_thumbnail_prefers_og_image(): void + { + $html = ' + + + test + + '; + + $thumbnail = BelgaArticlePageParser::extractThumbnail($html); + + $this->assertEquals('https://example.com/og-image.jpg', $thumbnail); + } + + public function test_extract_thumbnail_returns_null_when_not_found(): void + { + $html = '
No images here
'; + + $thumbnail = BelgaArticlePageParser::extractThumbnail($html); + + $this->assertNull($thumbnail); + } + + public function test_extract_data_returns_all_components(): void + { + $html = ' + + + + + + + +

Full article content here.

+ + + '; + + $data = BelgaArticlePageParser::extractData($html); + + $this->assertIsArray($data); + $this->assertArrayHasKey('title', $data); + $this->assertArrayHasKey('description', $data); + $this->assertArrayHasKey('full_article', $data); + $this->assertArrayHasKey('thumbnail', $data); + + $this->assertEquals('Test Article', $data['title']); + $this->assertEquals('Test description', $data['description']); + $this->assertEquals('Full article content here.', $data['full_article']); + $this->assertEquals('https://example.com/image.jpg', $data['thumbnail']); + } + + public function test_extract_data_handles_missing_components_gracefully(): void + { + $html = '
Minimal content
'; + + $data = BelgaArticlePageParser::extractData($html); + + $this->assertIsArray($data); + $this->assertArrayHasKey('title', $data); + $this->assertArrayHasKey('description', $data); + $this->assertArrayHasKey('full_article', $data); + $this->assertArrayHasKey('thumbnail', $data); + + $this->assertNull($data['title']); + $this->assertNull($data['description']); + $this->assertNull($data['full_article']); + $this->assertNull($data['thumbnail']); + } + + /** + * Test based on actual Belga HTML structure from real article + */ + public function test_extract_full_article_with_realistic_belga_html(): void + { + $html = ' + + +
+
+

Around 110,000 people joined the Antwerp Pride Parade on Saturday afternoon, according to police.

+

The event passed without major incidents. Earlier in the day, far-right group Voorpost held a pre-approved protest.

+

Police say they expect no problems with crowd dispersal, as departures will be staggered.

+
+
+ + + '; + + $fullArticle = BelgaArticlePageParser::extractFullArticle($html); + + $this->assertNotNull($fullArticle); + $this->assertStringContainsString('110,000 people joined', $fullArticle); + $this->assertStringContainsString('major incidents', $fullArticle); + $this->assertStringContainsString('crowd dispersal', $fullArticle); + + // Should join paragraphs with double newlines + $this->assertStringContainsString("\n\n", $fullArticle); + + // Should strip HTML tags + $this->assertStringNotContainsString('', $fullArticle); + $this->assertStringNotContainsString('', $fullArticle); + } +} \ No newline at end of file diff --git a/backend/tests/Unit/Services/ValidationServiceKeywordTest.php b/backend/tests/Unit/Services/ValidationServiceKeywordTest.php new file mode 100644 index 0000000..88f9152 --- /dev/null +++ b/backend/tests/Unit/Services/ValidationServiceKeywordTest.php @@ -0,0 +1,191 @@ +getMethod('validateByKeywords'); + $method->setAccessible(true); + return $method; + } + + public function test_validates_belgian_political_keywords(): void + { + $method = $this->getValidateByKeywordsMethod(); + + $this->assertTrue($method->invoke(null, 'This article discusses N-VA party policies.')); + $this->assertTrue($method->invoke(null, 'Bart De Wever made a statement today.')); + $this->assertTrue($method->invoke(null, 'Frank Vandenbroucke announced new healthcare policies.')); + $this->assertTrue($method->invoke(null, 'Alexander De Croo addressed the nation.')); + $this->assertTrue($method->invoke(null, 'The Vooruit party proposed new legislation.')); + $this->assertTrue($method->invoke(null, 'Open Vld supports the new budget.')); + $this->assertTrue($method->invoke(null, 'CD&V members voted on the proposal.')); + $this->assertTrue($method->invoke(null, 'Vlaams Belang criticized the decision.')); + $this->assertTrue($method->invoke(null, 'PTB organized a protest yesterday.')); + $this->assertTrue($method->invoke(null, 'PVDA released a statement.')); + } + + public function test_validates_belgian_location_keywords(): void + { + $method = $this->getValidateByKeywordsMethod(); + + $this->assertTrue($method->invoke(null, 'This event took place in Belgium.')); + $this->assertTrue($method->invoke(null, 'The Belgian government announced new policies.')); + $this->assertTrue($method->invoke(null, 'Flanders saw increased tourism this year.')); + $this->assertTrue($method->invoke(null, 'The Flemish government supports this initiative.')); + $this->assertTrue($method->invoke(null, 'Wallonia will receive additional funding.')); + $this->assertTrue($method->invoke(null, 'Brussels hosted the international conference.')); + $this->assertTrue($method->invoke(null, 'Antwerp Pride attracted thousands of participants.')); + $this->assertTrue($method->invoke(null, 'Ghent University published the research.')); + $this->assertTrue($method->invoke(null, 'Bruges tourism numbers increased.')); + $this->assertTrue($method->invoke(null, 'Leuven students organized the protest.')); + $this->assertTrue($method->invoke(null, 'Mechelen city council voted on the proposal.')); + $this->assertTrue($method->invoke(null, 'Namur hosted the cultural event.')); + $this->assertTrue($method->invoke(null, 'Liège airport saw increased traffic.')); + $this->assertTrue($method->invoke(null, 'Charleroi industrial zone expanded.')); + } + + public function test_validates_government_keywords(): void + { + $method = $this->getValidateByKeywordsMethod(); + + $this->assertTrue($method->invoke(null, 'Parliament voted on the new legislation.')); + $this->assertTrue($method->invoke(null, 'The government announced budget cuts.')); + $this->assertTrue($method->invoke(null, 'The minister addressed concerns about healthcare.')); + $this->assertTrue($method->invoke(null, 'New policy changes will take effect next month.')); + $this->assertTrue($method->invoke(null, 'The law was passed with majority support.')); + $this->assertTrue($method->invoke(null, 'New legislation affects education funding.')); + } + + public function test_validates_news_topic_keywords(): void + { + $method = $this->getValidateByKeywordsMethod(); + + $this->assertTrue($method->invoke(null, 'The economy showed signs of recovery.')); + $this->assertTrue($method->invoke(null, 'Economic indicators improved this quarter.')); + $this->assertTrue($method->invoke(null, 'Education reforms were announced today.')); + $this->assertTrue($method->invoke(null, 'Healthcare workers received additional support.')); + $this->assertTrue($method->invoke(null, 'Transport infrastructure will be upgraded.')); + $this->assertTrue($method->invoke(null, 'Climate change policies were discussed.')); + $this->assertTrue($method->invoke(null, 'Energy prices have increased significantly.')); + $this->assertTrue($method->invoke(null, 'European Union voted on trade agreements.')); + $this->assertTrue($method->invoke(null, 'EU sanctions were extended.')); + $this->assertTrue($method->invoke(null, 'Migration policies need urgent review.')); + $this->assertTrue($method->invoke(null, 'Security measures were enhanced.')); + $this->assertTrue($method->invoke(null, 'Justice system reforms are underway.')); + $this->assertTrue($method->invoke(null, 'Culture festivals received government funding.')); + $this->assertTrue($method->invoke(null, 'Police reported 18 administrative detentions.')); + } + + public function test_case_insensitive_keyword_matching(): void + { + $method = $this->getValidateByKeywordsMethod(); + + $this->assertTrue($method->invoke(null, 'This article mentions ANTWERP in capital letters.')); + $this->assertTrue($method->invoke(null, 'brussels is mentioned in lowercase.')); + $this->assertTrue($method->invoke(null, 'BeLgIuM is mentioned in mixed case.')); + $this->assertTrue($method->invoke(null, 'The FLEMISH government announced policies.')); + $this->assertTrue($method->invoke(null, 'n-va party policies were discussed.')); + $this->assertTrue($method->invoke(null, 'EUROPEAN union directives apply.')); + } + + public function test_rejects_content_without_belgian_keywords(): void + { + $method = $this->getValidateByKeywordsMethod(); + + $this->assertFalse($method->invoke(null, 'This article discusses random topics.')); + $this->assertFalse($method->invoke(null, 'International news from other countries.')); + $this->assertFalse($method->invoke(null, 'Technology updates and innovations.')); + $this->assertFalse($method->invoke(null, 'Sports results from around the world.')); + $this->assertFalse($method->invoke(null, 'Entertainment news and celebrity gossip.')); + $this->assertFalse($method->invoke(null, 'Weather forecast for next week.')); + $this->assertFalse($method->invoke(null, 'Stock market analysis and trends.')); + } + + public function test_keyword_matching_in_longer_text(): void + { + $method = $this->getValidateByKeywordsMethod(); + + $longText = ' + This is a comprehensive article about various topics. + It covers international relations, global economics, and regional policies. + However, it specifically mentions that Antwerp hosted a major conference + last week with participants from around the world. The event was + considered highly successful and will likely be repeated next year. + '; + + $this->assertTrue($method->invoke(null, $longText)); + + $longTextWithoutKeywords = ' + This is a comprehensive article about various topics. + It covers international relations, global finance, and commercial matters. + The conference was held in a major international city and attracted + participants from around the world. The event was considered highly + successful and will likely be repeated next year. + '; + + $this->assertFalse($method->invoke(null, $longTextWithoutKeywords)); + } + + public function test_empty_content_returns_false(): void + { + $method = $this->getValidateByKeywordsMethod(); + + $this->assertFalse($method->invoke(null, '')); + $this->assertFalse($method->invoke(null, ' ')); + $this->assertFalse($method->invoke(null, "\n\n\t")); + } + + /** + * Test comprehensive keyword coverage to ensure all expected keywords work + */ + public function test_all_keywords_are_functional(): void + { + $method = $this->getValidateByKeywordsMethod(); + + $expectedKeywords = [ + // Political parties and leaders + 'N-VA', 'Bart De Wever', 'Frank Vandenbroucke', 'Alexander De Croo', + 'Vooruit', 'Open Vld', 'CD&V', 'Vlaams Belang', 'PTB', 'PVDA', + + // Belgian locations and institutions + 'Belgium', 'Belgian', 'Flanders', 'Flemish', 'Wallonia', 'Brussels', + 'Antwerp', 'Ghent', 'Bruges', 'Leuven', 'Mechelen', 'Namur', 'Liège', 'Charleroi', + 'parliament', 'government', 'minister', 'policy', 'law', 'legislation', + + // Common Belgian news topics + 'economy', 'economic', 'education', 'healthcare', 'transport', 'climate', 'energy', + 'European', 'EU', 'migration', 'security', 'justice', 'culture', 'police' + ]; + + foreach ($expectedKeywords as $keyword) { + $testContent = "This article contains the keyword: {$keyword}."; + $result = $method->invoke(null, $testContent); + + $this->assertTrue($result, "Keyword '{$keyword}' should match but didn't"); + } + } + + public function test_partial_keyword_matches_work(): void + { + $method = $this->getValidateByKeywordsMethod(); + + // Keywords should match when they appear as part of larger words or phrases + $this->assertTrue($method->invoke(null, 'Anti-government protesters gathered.')); + $this->assertTrue($method->invoke(null, 'The policeman directed traffic.')); + $this->assertTrue($method->invoke(null, 'Educational reforms are needed.')); + $this->assertTrue($method->invoke(null, 'Economic growth accelerated.')); + $this->assertTrue($method->invoke(null, 'The European directive was implemented.')); + } +} \ No newline at end of file