The quick brown fox jumps over the lazy dog and then runs away into the forest + where many other animals live and play together every single day.
+diff --git a/app/Actions/FetchPageAction.php b/app/Actions/FetchPageAction.php index ec92a8d..e906b15 100644 --- a/app/Actions/FetchPageAction.php +++ b/app/Actions/FetchPageAction.php @@ -5,6 +5,7 @@ namespace App\Actions; use App\Enums\CrawlOutcomeEnum; +use App\Services\LanguageDetectionService; use App\Services\UrlService; use App\ValueObjects\FetchResult; use fivefilters\Readability\Configuration; @@ -20,9 +21,14 @@ class FetchPageAction { + private const MIN_WORDS_FOR_TEXT_DETECTION = 20; + + private const MIN_TEXT_DETECTION_CONFIDENCE = 0.30; + public function __construct( private Factory $http, private UrlService $urlService, + private LanguageDetectionService $languageDetection, ) {} public function __invoke(string $url): FetchResult @@ -46,8 +52,9 @@ public function __invoke(string $url): FetchResult [$outcome, $error] = $this->validateResponse($response); if ($outcome === CrawlOutcomeEnum::Success) { - [$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url); + [$title, $extractedText, $links, $crawler] = $this->extractTitleTextAndLinks($response->body(), $url); $wordCount = $extractedText !== '' ? count(preg_split('/\s+/u', trim($extractedText))) : 0; + [$language, $languageConfidence] = $this->detectLanguage($crawler, $extractedText, $wordCount); } return new FetchResult( @@ -59,6 +66,8 @@ public function __invoke(string $url): FetchResult outboundLinks: $links ?? collect(), wordCount: $wordCount ?? null, errorMessage: $error ?? null, + language: $language ?? null, + languageConfidence: $languageConfidence ?? null, ); } @@ -135,7 +144,7 @@ private function extractTitleTextAndLinks(string $body, string $url): array ->unique() ->values(); - return [$title, $extractedText, $linksResolved]; + return [$title, $extractedText, $linksResolved, $crawler]; } private function resolveAndValidateLink(string $href, string $finalUrl): ?string @@ -159,4 +168,27 @@ private function resolveAndValidateLink(string $href, string $finalUrl): ?string return $resolved; } + + /** + * @return array{0: ?string, 1: ?float} + */ + private function detectLanguage(Crawler $crawler, string $extractedText, ?int $wordCount = null): array + { + if ($wordCount >= self::MIN_WORDS_FOR_TEXT_DETECTION) { + $result = $this->languageDetection->detect($extractedText); + if ($result !== null && $result[1] >= self::MIN_TEXT_DETECTION_CONFIDENCE) { + return [$result[0], $result[1]]; + } + } + + $lang = $crawler->filter('html')->count() > 0 + ? trim($crawler->filter('html')->attr('lang') ?? '') + : ''; + + if ($lang !== '' && strlen($lang) <= 35) { + return [$lang, 1.0]; + } + + return [null, null]; + } } diff --git a/tests/Feature/Actions/FetchPageActionTest.php b/tests/Feature/Actions/FetchPageActionTest.php index b5f415a..826c755 100644 --- a/tests/Feature/Actions/FetchPageActionTest.php +++ b/tests/Feature/Actions/FetchPageActionTest.php @@ -6,6 +6,7 @@ use App\Actions\FetchPageAction; use App\Enums\CrawlOutcomeEnum; +use App\Services\LanguageDetectionService; use App\ValueObjects\FetchResult; use GuzzleHttp\Exception\ConnectException; use GuzzleHttp\Psr7\Request; @@ -323,6 +324,186 @@ public function test_fragment_only_href_is_filtered_from_outbound_links(): void $this->assertSame(0, $result->outboundLinks->count()); } + public function test_sufficient_text_triggers_language_detection_and_result_propagates(): void + { + // 24 words — above the detection threshold + $body = <<<'HTML' + + +
The quick brown fox jumps over the lazy dog and then runs away into the forest + where many other animals live and play together every single day.
+Too short to detect language automatically.
+Too short to detect language automatically.
+Too short to detect language automatically.
+Too short to detect language automatically.
+The quick brown fox jumps over the lazy dog and then runs away into the forest + where many other animals live and play together every single day.
+