diff --git a/backend/app/Services/Article/ValidationService.php b/backend/app/Services/Article/ValidationService.php index ac17e1e..819af4c 100644 --- a/backend/app/Services/Article/ValidationService.php +++ b/backend/app/Services/Article/ValidationService.php @@ -12,7 +12,7 @@ public static function validate(Article $article): Article $articleData = ArticleFetcher::fetchArticleData($article); - // Update article with fetched data (title, description, etc.) + // Update article with fetched metadata (title, description) $updateData = [ 'validated_at' => now(), ]; @@ -20,12 +20,10 @@ public static function validate(Article $article): Article if (!empty($articleData)) { $updateData['title'] = $articleData['title'] ?? null; $updateData['description'] = $articleData['description'] ?? null; - $updateData['full_article'] = $articleData['full_article'] ?? null; - $updateData['thumbnail'] = $articleData['thumbnail'] ?? null; } if (!isset($articleData['full_article']) || empty($articleData['full_article'])) { - logger()->warning('Article data missing full_article key', [ + logger()->warning('Article data missing full_article content', [ 'article_id' => $article->id, 'url' => $article->url ]); @@ -36,6 +34,7 @@ public static function validate(Article $article): Article return $article->refresh(); } + // Validate using extracted content (not stored) $validationResult = self::validateByKeywords($articleData['full_article']); $updateData['is_valid'] = $validationResult; @@ -46,8 +45,20 @@ public static function validate(Article $article): Article private static function validateByKeywords(string $full_article): bool { + // Belgian news content keywords - broader set for Belgian news relevance $keywords = [ - 'N-VA', 'Bart De Wever', 'Frank Vandenbroucke', + // Political parties and leaders + 'N-VA', 'Bart De Wever', 'Frank Vandenbroucke', 'Alexander De Croo', + 'Vooruit', 'Open Vld', 'CD&V', 'Vlaams Belang', 'PTB', 'PVDA', + + // Belgian locations and institutions + 'Belgium', 'Belgian', 'Flanders', 'Flemish', 'Wallonia', 'Brussels', + 'Antwerp', 'Ghent', 'Bruges', 'Leuven', 'Mechelen', 'Namur', 'Liège', 'Charleroi', + 'parliament', 'government', 'minister', 'policy', 'law', 'legislation', + + // Common Belgian news topics + 'economy', 'economic', 'education', 'healthcare', 'transport', 'climate', 'energy', + 'European', 'EU', 'migration', 'security', 'justice', 'culture', 'police' ]; foreach ($keywords as $keyword) { diff --git a/backend/app/Services/Parsers/BelgaArticlePageParser.php b/backend/app/Services/Parsers/BelgaArticlePageParser.php index 0a2d2dd..b438d32 100644 --- a/backend/app/Services/Parsers/BelgaArticlePageParser.php +++ b/backend/app/Services/Parsers/BelgaArticlePageParser.php @@ -55,15 +55,41 @@ public static function extractFullArticle(string $html): ?string $cleanHtml = preg_replace('/ + + +
+Clean content.
+ + +