From 81b3c7f70bdc770e231e07ed7ef345ad98711187 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Tue, 28 Apr 2026 01:09:18 +0200 Subject: [PATCH] 13 - Wire LanguageDetectionService into FetchPageAction with lang attr fallback --- app/Actions/FetchPageAction.php | 36 +++- tests/Feature/Actions/FetchPageActionTest.php | 181 ++++++++++++++++++ 2 files changed, 215 insertions(+), 2 deletions(-) diff --git a/app/Actions/FetchPageAction.php b/app/Actions/FetchPageAction.php index ec92a8d..e906b15 100644 --- a/app/Actions/FetchPageAction.php +++ b/app/Actions/FetchPageAction.php @@ -5,6 +5,7 @@ namespace App\Actions; use App\Enums\CrawlOutcomeEnum; +use App\Services\LanguageDetectionService; use App\Services\UrlService; use App\ValueObjects\FetchResult; use fivefilters\Readability\Configuration; @@ -20,9 +21,14 @@ class FetchPageAction { + private const MIN_WORDS_FOR_TEXT_DETECTION = 20; + + private const MIN_TEXT_DETECTION_CONFIDENCE = 0.30; + public function __construct( private Factory $http, private UrlService $urlService, + private LanguageDetectionService $languageDetection, ) {} public function __invoke(string $url): FetchResult @@ -46,8 +52,9 @@ public function __invoke(string $url): FetchResult [$outcome, $error] = $this->validateResponse($response); if ($outcome === CrawlOutcomeEnum::Success) { - [$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url); + [$title, $extractedText, $links, $crawler] = $this->extractTitleTextAndLinks($response->body(), $url); $wordCount = $extractedText !== '' ? count(preg_split('/\s+/u', trim($extractedText))) : 0; + [$language, $languageConfidence] = $this->detectLanguage($crawler, $extractedText, $wordCount); } return new FetchResult( @@ -59,6 +66,8 @@ public function __invoke(string $url): FetchResult outboundLinks: $links ?? collect(), wordCount: $wordCount ?? null, errorMessage: $error ?? null, + language: $language ?? null, + languageConfidence: $languageConfidence ?? null, ); } @@ -135,7 +144,7 @@ private function extractTitleTextAndLinks(string $body, string $url): array ->unique() ->values(); - return [$title, $extractedText, $linksResolved]; + return [$title, $extractedText, $linksResolved, $crawler]; } private function resolveAndValidateLink(string $href, string $finalUrl): ?string @@ -159,4 +168,27 @@ private function resolveAndValidateLink(string $href, string $finalUrl): ?string return $resolved; } + + /** + * @return array{0: ?string, 1: ?float} + */ + private function detectLanguage(Crawler $crawler, string $extractedText, ?int $wordCount = null): array + { + if ($wordCount >= self::MIN_WORDS_FOR_TEXT_DETECTION) { + $result = $this->languageDetection->detect($extractedText); + if ($result !== null && $result[1] >= self::MIN_TEXT_DETECTION_CONFIDENCE) { + return [$result[0], $result[1]]; + } + } + + $lang = $crawler->filter('html')->count() > 0 + ? trim($crawler->filter('html')->attr('lang') ?? '') + : ''; + + if ($lang !== '' && strlen($lang) <= 35) { + return [$lang, 1.0]; + } + + return [null, null]; + } } diff --git a/tests/Feature/Actions/FetchPageActionTest.php b/tests/Feature/Actions/FetchPageActionTest.php index b5f415a..826c755 100644 --- a/tests/Feature/Actions/FetchPageActionTest.php +++ b/tests/Feature/Actions/FetchPageActionTest.php @@ -6,6 +6,7 @@ use App\Actions\FetchPageAction; use App\Enums\CrawlOutcomeEnum; +use App\Services\LanguageDetectionService; use App\ValueObjects\FetchResult; use GuzzleHttp\Exception\ConnectException; use GuzzleHttp\Psr7\Request; @@ -323,6 +324,186 @@ public function test_fragment_only_href_is_filtered_from_outbound_links(): void $this->assertSame(0, $result->outboundLinks->count()); } + public function test_sufficient_text_triggers_language_detection_and_result_propagates(): void + { + // 24 words — above the detection threshold + $body = <<<'HTML' + + + Language Detection Test + +
+

The quick brown fox jumps over the lazy dog and then runs away into the forest + where many other animals live and play together every single day.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($body, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect') + ->once() + ->andReturn(['en', 0.95]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame('en', $result->language); + $this->assertSame(0.95, $result->languageConfidence); + } + + public function test_short_body_with_html_lang_attr_skips_service_and_uses_lang_attr(): void + { + // 7 words — below the detection threshold + $html = <<<'HTML' + + + Short Page + +
+

Too short to detect language automatically.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect')->never(); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame('pt-BR', $result->language); + $this->assertSame(1.0, $result->languageConfidence); + } + + public function test_short_body_with_no_lang_attr_returns_null_language(): void + { + // 7 words — below the detection threshold + $html = <<<'HTML' + + + Short Page + +
+

Too short to detect language automatically.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect')->never(); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertNull($result->language); + $this->assertNull($result->languageConfidence); + } + + public function test_whitespace_only_lang_attr_is_treated_as_absent(): void + { + // 7 words — below the detection threshold; lang attr is blank/whitespace-only + $html = <<<'HTML' + + + Short Page + +
+

Too short to detect language automatically.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect')->never(); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertNull($result->language); + $this->assertNull($result->languageConfidence); + } + + public function test_lang_attr_longer_than_35_chars_is_rejected(): void + { + // 7 words — below the detection threshold; lang attr exceeds BCP-47 column width (string(35)) + $html = <<<'HTML' + + + Short Page + +
+

Too short to detect language automatically.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect')->never(); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertNull($result->language); + $this->assertNull($result->languageConfidence); + } + + public function test_low_confidence_detection_falls_through_to_lang_attr(): void + { + // 24 words — above the detection threshold; service returns low-confidence result + $html = <<<'HTML' + + + Confidence Floor Test + +
+

The quick brown fox jumps over the lazy dog and then runs away into the forest + where many other animals live and play together every single day.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect') + ->once() + ->andReturn(['xx', 0.15]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame('en-US', $result->language); + $this->assertSame(1.0, $result->languageConfidence); + } + private function makeAction(): FetchPageAction { return app(FetchPageAction::class);