13 - Wire LanguageDetectionService into FetchPageAction with lang attr fallback

This commit is contained in:
myrmidex 2026-04-28 01:09:18 +02:00
parent cb83b0df90
commit 81b3c7f70b
2 changed files with 215 additions and 2 deletions

View file

@ -5,6 +5,7 @@
namespace App\Actions;
use App\Enums\CrawlOutcomeEnum;
use App\Services\LanguageDetectionService;
use App\Services\UrlService;
use App\ValueObjects\FetchResult;
use fivefilters\Readability\Configuration;
@ -20,9 +21,14 @@
class FetchPageAction
{
/** Minimum word count of the extracted text before text-based language detection is attempted. */
private const MIN_WORDS_FOR_TEXT_DETECTION = 20;
/** Lowest confidence accepted from text-based detection; weaker results are ignored in favour of the <html lang> fallback. */
private const MIN_TEXT_DETECTION_CONFIDENCE = 0.30;
/**
 * Receives the action's collaborators and stores them through constructor
 * property promotion; the constructor itself performs no work.
 *
 * @param Factory                  $http              HTTP client factory (presumably used to fetch the page; the call site is outside this excerpt).
 * @param UrlService               $urlService        URL helper service (usage is outside this excerpt; presumably link resolution/validation).
 * @param LanguageDetectionService $languageDetection Service queried by detectLanguage() for text-based language detection.
 */
public function __construct(
private Factory $http,
private UrlService $urlService,
private LanguageDetectionService $languageDetection,
) {}
public function __invoke(string $url): FetchResult
@ -46,8 +52,9 @@ public function __invoke(string $url): FetchResult
[$outcome, $error] = $this->validateResponse($response);
if ($outcome === CrawlOutcomeEnum::Success) {
[$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url);
[$title, $extractedText, $links, $crawler] = $this->extractTitleTextAndLinks($response->body(), $url);
$wordCount = $extractedText !== '' ? count(preg_split('/\s+/u', trim($extractedText))) : 0;
[$language, $languageConfidence] = $this->detectLanguage($crawler, $extractedText, $wordCount);
}
return new FetchResult(
@ -59,6 +66,8 @@ public function __invoke(string $url): FetchResult
outboundLinks: $links ?? collect(),
wordCount: $wordCount ?? null,
errorMessage: $error ?? null,
language: $language ?? null,
languageConfidence: $languageConfidence ?? null,
);
}
@ -135,7 +144,7 @@ private function extractTitleTextAndLinks(string $body, string $url): array
->unique()
->values();
return [$title, $extractedText, $linksResolved];
return [$title, $extractedText, $linksResolved, $crawler];
}
private function resolveAndValidateLink(string $href, string $finalUrl): ?string
@ -159,4 +168,27 @@ private function resolveAndValidateLink(string $href, string $finalUrl): ?string
return $resolved;
}
/**
 * Determine the page language and how much that determination can be trusted.
 *
 * Strategy, in order:
 *  1. If the text has at least MIN_WORDS_FOR_TEXT_DETECTION words, ask the
 *     language detection service. Its answer is used only when its confidence
 *     reaches MIN_TEXT_DETECTION_CONFIDENCE.
 *  2. Otherwise fall back to the `lang` attribute of the root `<html>` element,
 *     reported with confidence 1.0.
 *  3. If neither yields a usable value, return [null, null].
 *
 * The `lang` attribute comes from untrusted markup, so it is accepted only when
 * it is non-empty after trimming, at most 35 bytes long, and shaped like a
 * BCP 47 language tag (alphabetic primary subtag followed by hyphen-separated
 * alphanumeric subtags of 1-8 characters). Anything else - template
 * placeholders, `en_US`, markup fragments - is treated as if the attribute
 * were absent.
 *
 * @param Crawler $crawler       Crawler for the fetched document; only queried for `<html lang>`.
 * @param string  $extractedText Text handed to the language detection service.
 * @param ?int    $wordCount     Word count of $extractedText; null (or a value below the minimum) skips text detection.
 *
 * @return array{0: ?string, 1: ?float} Language tag and confidence, or [null, null] when undetermined.
 */
private function detectLanguage(Crawler $crawler, string $extractedText, ?int $wordCount = null): array
{
    // Explicit null check instead of relying on `null >= int` evaluating to false.
    if ($wordCount !== null && $wordCount >= self::MIN_WORDS_FOR_TEXT_DETECTION) {
        $detected = $this->languageDetection->detect($extractedText);

        if ($detected !== null && $detected[1] >= self::MIN_TEXT_DETECTION_CONFIDENCE) {
            return [$detected[0], $detected[1]];
        }
    }

    // Query the root element once; attr() is only called when a node exists.
    $htmlElement = $crawler->filter('html');
    $lang = $htmlElement->count() > 0
        ? trim($htmlElement->attr('lang') ?? '')
        : '';

    if ($lang === '' || strlen($lang) > 35) {
        return [null, null];
    }

    // Well-formedness gate for the untrusted attribute value. The `D` modifier
    // stops `$` from matching before a trailing newline.
    if (preg_match('/^[A-Za-z]{1,8}(?:-[A-Za-z0-9]{1,8})*$/D', $lang) !== 1) {
        return [null, null];
    }

    return [$lang, 1.0];
}
}

View file

@ -6,6 +6,7 @@
use App\Actions\FetchPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Services\LanguageDetectionService;
use App\ValueObjects\FetchResult;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Psr7\Request;
@ -323,6 +324,186 @@ public function test_fragment_only_href_is_filtered_from_outbound_links(): void
$this->assertSame(0, $result->outboundLinks->count());
}
/**
 * With enough article text, the detection service is consulted exactly once
 * and its language/confidence pair is carried through to the FetchResult.
 */
public function test_sufficient_text_triggers_language_detection_and_result_propagates(): void
{
    // The service must be asked exactly once and answers with confident English.
    $this->mock(LanguageDetectionService::class)
        ->shouldReceive('detect')
        ->once()
        ->andReturn(['en', 0.95]);

    // Fixture: no lang attribute, one paragraph of more than twenty words.
    $markup = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Language Detection Test</title></head>
        <body>
        <article>
        <p>The quick brown fox jumps over the lazy dog and then runs away into the forest
        where many other animals live and play together every single day.</p>
        </article>
        </body>
        </html>
        HTML;

    Http::fake([
        'example.com/*' => Http::response($markup, 200, ['Content-Type' => 'text/html']),
    ]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame('en', $fetched->language);
    $this->assertSame(0.95, $fetched->languageConfidence);
}
/**
 * A page with too little text never reaches the detection service; the
 * `<html lang>` value is reported instead, with full confidence.
 */
public function test_short_body_with_html_lang_attr_skips_service_and_uses_lang_attr(): void
{
    // Any call to detect() fails the test.
    $this->mock(LanguageDetectionService::class)
        ->shouldReceive('detect')
        ->never();

    // Fixture: a single short sentence and an explicit lang="pt-BR".
    $page = <<<'HTML'
        <!DOCTYPE html>
        <html lang="pt-BR">
        <head><title>Short Page</title></head>
        <body>
        <article>
        <p>Too short to detect language automatically.</p>
        </article>
        </body>
        </html>
        HTML;

    Http::fake([
        'example.com/*' => Http::response($page, 200, ['Content-Type' => 'text/html']),
    ]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame('pt-BR', $fetched->language);
    $this->assertSame(1.0, $fetched->languageConfidence);
}
/**
 * With too little text and no `lang` attribute there is nothing to go on:
 * both language and confidence stay null, and the service is never called.
 */
public function test_short_body_with_no_lang_attr_returns_null_language(): void
{
    // Any call to detect() fails the test.
    $this->mock(LanguageDetectionService::class)
        ->shouldReceive('detect')
        ->never();

    // Fixture: a single short sentence, <html> without a lang attribute.
    $page = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Short Page</title></head>
        <body>
        <article>
        <p>Too short to detect language automatically.</p>
        </article>
        </body>
        </html>
        HTML;

    Http::fake([
        'example.com/*' => Http::response($page, 200, ['Content-Type' => 'text/html']),
    ]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertNull($fetched->language);
    $this->assertNull($fetched->languageConfidence);
}
/**
 * A `lang` attribute containing only whitespace counts as missing, so a
 * short page with such an attribute yields no language at all.
 */
public function test_whitespace_only_lang_attr_is_treated_as_absent(): void
{
    // Any call to detect() fails the test.
    $this->mock(LanguageDetectionService::class)
        ->shouldReceive('detect')
        ->never();

    // Fixture: a single short sentence; the lang attribute is a lone space.
    $page = <<<'HTML'
        <!DOCTYPE html>
        <html lang=" ">
        <head><title>Short Page</title></head>
        <body>
        <article>
        <p>Too short to detect language automatically.</p>
        </article>
        </body>
        </html>
        HTML;

    Http::fake([
        'example.com/*' => Http::response($page, 200, ['Content-Type' => 'text/html']),
    ]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertNull($fetched->language);
    $this->assertNull($fetched->languageConfidence);
}
/**
 * A `lang` attribute longer than 35 characters (the width of the language
 * column, string(35)) is discarded rather than stored.
 */
public function test_lang_attr_longer_than_35_chars_is_rejected(): void
{
    // Any call to detect() fails the test.
    $this->mock(LanguageDetectionService::class)
        ->shouldReceive('detect')
        ->never();

    // Fixture: a single short sentence; the lang value exceeds 35 characters.
    $page = <<<'HTML'
        <!DOCTYPE html>
        <html lang="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-bbb">
        <head><title>Short Page</title></head>
        <body>
        <article>
        <p>Too short to detect language automatically.</p>
        </article>
        </body>
        </html>
        HTML;

    Http::fake([
        'example.com/*' => Http::response($page, 200, ['Content-Type' => 'text/html']),
    ]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertNull($fetched->language);
    $this->assertNull($fetched->languageConfidence);
}
/**
 * When the detection service answers with a confidence below the accepted
 * floor, its guess is dropped and the `<html lang>` value wins instead.
 */
public function test_low_confidence_detection_falls_through_to_lang_attr(): void
{
    // The service is consulted once but returns an unconvincing result.
    $this->mock(LanguageDetectionService::class)
        ->shouldReceive('detect')
        ->once()
        ->andReturn(['xx', 0.15]);

    // Fixture: lang="en-US" plus a paragraph of more than twenty words.
    $page = <<<'HTML'
        <!DOCTYPE html>
        <html lang="en-US">
        <head><title>Confidence Floor Test</title></head>
        <body>
        <article>
        <p>The quick brown fox jumps over the lazy dog and then runs away into the forest
        where many other animals live and play together every single day.</p>
        </article>
        </body>
        </html>
        HTML;

    Http::fake([
        'example.com/*' => Http::response($page, 200, ['Content-Type' => 'text/html']),
    ]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame('en-US', $fetched->language);
    $this->assertSame(1.0, $fetched->languageConfidence);
}
private function makeAction(): FetchPageAction
{
return app(FetchPageAction::class);