13 - Wire LanguageDetectionService into FetchPageAction with lang attr fallback
This commit is contained in:
parent
cb83b0df90
commit
81b3c7f70b
2 changed files with 215 additions and 2 deletions
|
|
@ -5,6 +5,7 @@
|
||||||
namespace App\Actions;
|
namespace App\Actions;
|
||||||
|
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
use App\Enums\CrawlOutcomeEnum;
|
||||||
|
use App\Services\LanguageDetectionService;
|
||||||
use App\Services\UrlService;
|
use App\Services\UrlService;
|
||||||
use App\ValueObjects\FetchResult;
|
use App\ValueObjects\FetchResult;
|
||||||
use fivefilters\Readability\Configuration;
|
use fivefilters\Readability\Configuration;
|
||||||
|
|
@ -20,9 +21,14 @@
|
||||||
|
|
||||||
class FetchPageAction
|
class FetchPageAction
|
||||||
{
|
{
|
||||||
|
private const MIN_WORDS_FOR_TEXT_DETECTION = 20;
|
||||||
|
|
||||||
|
private const MIN_TEXT_DETECTION_CONFIDENCE = 0.30;
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private Factory $http,
|
private Factory $http,
|
||||||
private UrlService $urlService,
|
private UrlService $urlService,
|
||||||
|
private LanguageDetectionService $languageDetection,
|
||||||
) {}
|
) {}
|
||||||
|
|
||||||
public function __invoke(string $url): FetchResult
|
public function __invoke(string $url): FetchResult
|
||||||
|
|
@ -46,8 +52,9 @@ public function __invoke(string $url): FetchResult
|
||||||
[$outcome, $error] = $this->validateResponse($response);
|
[$outcome, $error] = $this->validateResponse($response);
|
||||||
|
|
||||||
if ($outcome === CrawlOutcomeEnum::Success) {
|
if ($outcome === CrawlOutcomeEnum::Success) {
|
||||||
[$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url);
|
[$title, $extractedText, $links, $crawler] = $this->extractTitleTextAndLinks($response->body(), $url);
|
||||||
$wordCount = $extractedText !== '' ? count(preg_split('/\s+/u', trim($extractedText))) : 0;
|
$wordCount = $extractedText !== '' ? count(preg_split('/\s+/u', trim($extractedText))) : 0;
|
||||||
|
[$language, $languageConfidence] = $this->detectLanguage($crawler, $extractedText, $wordCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
return new FetchResult(
|
return new FetchResult(
|
||||||
|
|
@ -59,6 +66,8 @@ public function __invoke(string $url): FetchResult
|
||||||
outboundLinks: $links ?? collect(),
|
outboundLinks: $links ?? collect(),
|
||||||
wordCount: $wordCount ?? null,
|
wordCount: $wordCount ?? null,
|
||||||
errorMessage: $error ?? null,
|
errorMessage: $error ?? null,
|
||||||
|
language: $language ?? null,
|
||||||
|
languageConfidence: $languageConfidence ?? null,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -135,7 +144,7 @@ private function extractTitleTextAndLinks(string $body, string $url): array
|
||||||
->unique()
|
->unique()
|
||||||
->values();
|
->values();
|
||||||
|
|
||||||
return [$title, $extractedText, $linksResolved];
|
return [$title, $extractedText, $linksResolved, $crawler];
|
||||||
}
|
}
|
||||||
|
|
||||||
private function resolveAndValidateLink(string $href, string $finalUrl): ?string
|
private function resolveAndValidateLink(string $href, string $finalUrl): ?string
|
||||||
|
|
@ -159,4 +168,27 @@ private function resolveAndValidateLink(string $href, string $finalUrl): ?string
|
||||||
|
|
||||||
return $resolved;
|
return $resolved;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Resolve the page language, preferring text-based detection and falling back
 * to the `lang` attribute declared on the root <html> element.
 *
 * Text detection is attempted only when the word count reaches
 * MIN_WORDS_FOR_TEXT_DETECTION, and its result is used only when the reported
 * confidence is at least MIN_TEXT_DETECTION_CONFIDENCE. Otherwise the declared
 * `lang` attribute is returned with a confidence of 1.0, provided it is shaped
 * like a language tag and is at most 35 bytes long.
 *
 * @param  Crawler  $crawler  Parsed document in which the <html> element is looked up.
 * @param  string  $extractedText  Text handed to the language detection service.
 * @param  ?int  $wordCount  Word count of $extractedText; null skips text detection.
 * @return array{0: ?string, 1: ?float} Language and confidence, or [null, null] when undetermined.
 */
private function detectLanguage(Crawler $crawler, string $extractedText, ?int $wordCount = null): array
{
    // Explicit null check instead of relying on PHP's loose null-to-int comparison.
    if ($wordCount !== null && $wordCount >= self::MIN_WORDS_FOR_TEXT_DETECTION) {
        $detected = $this->languageDetection->detect($extractedText);

        if ($detected !== null && $detected[1] >= self::MIN_TEXT_DETECTION_CONFIDENCE) {
            return [$detected[0], $detected[1]];
        }
    }

    // Query the root element once and reuse the node list for both checks.
    $root = $crawler->filter('html');
    $declared = $root->count() > 0 ? trim($root->attr('lang') ?? '') : '';

    // The attribute comes from an untrusted page, so only accept values shaped
    // like a language tag (alphabetic primary subtag followed by hyphen-separated
    // alphanumeric subtags of 1-8 characters). Anything else must not be reported
    // with full confidence. An empty string never matches.
    $isWellFormed = preg_match('/\A[A-Za-z]{1,8}(?:-[A-Za-z0-9]{1,8})*\z/', $declared) === 1;

    // 35 is the maximum accepted tag length in bytes.
    // NOTE(review): presumably mirrors the storage column width — confirm.
    if ($isWellFormed && strlen($declared) <= 35) {
        return [$declared, 1.0];
    }

    return [null, null];
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@
|
||||||
|
|
||||||
use App\Actions\FetchPageAction;
|
use App\Actions\FetchPageAction;
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
use App\Enums\CrawlOutcomeEnum;
|
||||||
|
use App\Services\LanguageDetectionService;
|
||||||
use App\ValueObjects\FetchResult;
|
use App\ValueObjects\FetchResult;
|
||||||
use GuzzleHttp\Exception\ConnectException;
|
use GuzzleHttp\Exception\ConnectException;
|
||||||
use GuzzleHttp\Psr7\Request;
|
use GuzzleHttp\Psr7\Request;
|
||||||
|
|
@ -323,6 +324,186 @@ public function test_fragment_only_href_is_filtered_from_outbound_links(): void
|
||||||
$this->assertSame(0, $result->outboundLinks->count());
|
$this->assertSame(0, $result->outboundLinks->count());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * A body above the word threshold is sent to the detection service, and the
 * language/confidence pair the service reports is carried onto the FetchResult.
 */
public function test_sufficient_text_triggers_language_detection_and_result_propagates(): void
{
    // The service must be consulted exactly once and reports a confident match.
    $this->mock(LanguageDetectionService::class)
        ->shouldReceive('detect')
        ->once()
        ->andReturn(['en', 0.95]);

    // Article paragraph long enough to clear the text-detection word threshold.
    $page = <<<'HTML'
    <!DOCTYPE html>
    <html>
    <head><title>Language Detection Test</title></head>
    <body>
    <article>
    <p>The quick brown fox jumps over the lazy dog and then runs away into the forest
    where many other animals live and play together every single day.</p>
    </article>
    </body>
    </html>
    HTML;

    Http::fake([
        'example.com/*' => Http::response($page, 200, ['Content-Type' => 'text/html']),
    ]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame('en', $fetched->language);
    $this->assertSame(0.95, $fetched->languageConfidence);
}
|
||||||
|
|
||||||
|
/**
 * A body below the word threshold never reaches the detection service; the
 * <html lang> attribute is reported instead, with full confidence.
 */
public function test_short_body_with_html_lang_attr_skips_service_and_uses_lang_attr(): void
{
    // Text detection must be bypassed entirely for a body this short.
    $this->mock(LanguageDetectionService::class)
        ->shouldReceive('detect')
        ->never();

    // Single short sentence, below the detection threshold, with a declared lang.
    $page = <<<'HTML'
    <!DOCTYPE html>
    <html lang="pt-BR">
    <head><title>Short Page</title></head>
    <body>
    <article>
    <p>Too short to detect language automatically.</p>
    </article>
    </body>
    </html>
    HTML;

    Http::fake([
        'example.com/*' => Http::response($page, 200, ['Content-Type' => 'text/html']),
    ]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame('pt-BR', $fetched->language);
    $this->assertSame(1.0, $fetched->languageConfidence);
}
|
||||||
|
|
||||||
|
/**
 * With a body below the word threshold and no <html lang> attribute there is
 * nothing to go on, so both language and confidence come back null.
 */
public function test_short_body_with_no_lang_attr_returns_null_language(): void
{
    // Text detection must be bypassed entirely for a body this short.
    $this->mock(LanguageDetectionService::class)
        ->shouldReceive('detect')
        ->never();

    // Single short sentence, below the detection threshold, and no lang attribute.
    $page = <<<'HTML'
    <!DOCTYPE html>
    <html>
    <head><title>Short Page</title></head>
    <body>
    <article>
    <p>Too short to detect language automatically.</p>
    </article>
    </body>
    </html>
    HTML;

    Http::fake([
        'example.com/*' => Http::response($page, 200, ['Content-Type' => 'text/html']),
    ]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertNull($fetched->language);
    $this->assertNull($fetched->languageConfidence);
}
|
||||||
|
|
||||||
|
/**
 * A lang attribute containing only whitespace counts as missing, so a short
 * page carrying one still yields a null language and confidence.
 */
public function test_whitespace_only_lang_attr_is_treated_as_absent(): void
{
    // Text detection must be bypassed entirely for a body this short.
    $this->mock(LanguageDetectionService::class)
        ->shouldReceive('detect')
        ->never();

    // Short body below the detection threshold; the lang attribute is a lone space.
    $page = <<<'HTML'
    <!DOCTYPE html>
    <html lang=" ">
    <head><title>Short Page</title></head>
    <body>
    <article>
    <p>Too short to detect language automatically.</p>
    </article>
    </body>
    </html>
    HTML;

    Http::fake([
        'example.com/*' => Http::response($page, 200, ['Content-Type' => 'text/html']),
    ]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertNull($fetched->language);
    $this->assertNull($fetched->languageConfidence);
}
|
||||||
|
|
||||||
|
/**
 * A lang attribute longer than the 35-character limit is discarded, so a
 * short page carrying one yields a null language and confidence.
 */
public function test_lang_attr_longer_than_35_chars_is_rejected(): void
{
    // Text detection must be bypassed entirely for a body this short.
    $this->mock(LanguageDetectionService::class)
        ->shouldReceive('detect')
        ->never();

    // Short body below the detection threshold; the lang value exceeds 35 characters.
    $page = <<<'HTML'
    <!DOCTYPE html>
    <html lang="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-bbb">
    <head><title>Short Page</title></head>
    <body>
    <article>
    <p>Too short to detect language automatically.</p>
    </article>
    </body>
    </html>
    HTML;

    Http::fake([
        'example.com/*' => Http::response($page, 200, ['Content-Type' => 'text/html']),
    ]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertNull($fetched->language);
    $this->assertNull($fetched->languageConfidence);
}
|
||||||
|
|
||||||
|
/**
 * When the detection service answers with a confidence below the accepted
 * floor, its guess is ignored and the <html lang> attribute is used instead.
 */
public function test_low_confidence_detection_falls_through_to_lang_attr(): void
{
    // The service is consulted once but returns a guess below the confidence floor.
    $this->mock(LanguageDetectionService::class)
        ->shouldReceive('detect')
        ->once()
        ->andReturn(['xx', 0.15]);

    // Body long enough to clear the detection threshold, with a declared lang to fall back on.
    $page = <<<'HTML'
    <!DOCTYPE html>
    <html lang="en-US">
    <head><title>Confidence Floor Test</title></head>
    <body>
    <article>
    <p>The quick brown fox jumps over the lazy dog and then runs away into the forest
    where many other animals live and play together every single day.</p>
    </article>
    </body>
    </html>
    HTML;

    Http::fake([
        'example.com/*' => Http::response($page, 200, ['Content-Type' => 'text/html']),
    ]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame('en-US', $fetched->language);
    $this->assertSame(1.0, $fetched->languageConfidence);
}
|
||||||
|
|
||||||
private function makeAction(): FetchPageAction
|
private function makeAction(): FetchPageAction
|
||||||
{
|
{
|
||||||
return app(FetchPageAction::class);
|
return app(FetchPageAction::class);
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue