13 - Make page language sticky across re-crawls when new fetch returns null

This commit is contained in:
myrmidex 2026-04-28 01:32:38 +02:00
parent 1cba8f3fc9
commit a59c086da2
2 changed files with 45 additions and 3 deletions

View file

@ -84,8 +84,14 @@ private function updatePageStatus(FetchResult $result): void
'status' => $status,
'fetched_at' => now(),
'title' => $result->title,
// Sticky language: only write when detection produced a value, so a re-crawl
// returning null doesn't erase a previously-detected language. Guarding on
// language alone is sufficient because FetchPageAction::detectLanguage()
// always returns the pair as both-null or both-non-null (never mixed).
...($result->language !== null ? [
'language' => $result->language,
'language_confidence' => $result->languageConfidence,
] : []),
],
PageStatusEnum::Failed => [
'status' => $status,

View file

@ -482,7 +482,43 @@ public function test_handle_persists_language_on_success(): void
$this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001);
}
public function test_handle_persists_null_language_on_success(): void
/**
 * A successful re-crawl whose fetch result carries no language must leave the
 * language and confidence already stored on the page untouched, while the
 * status and title columns are still updated from the new fetch.
 */
public function test_handle_does_not_overwrite_existing_language_when_new_fetch_returns_null(): void
{
    Queue::fake();

    // Arrange: the mocked fetch succeeds but reports null language and confidence.
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        title: 'Hello',
        extractedText: 'hi',
        wordCount: 1,
        language: null,
        languageConfidence: null,
    );

    // Arrange: a page that already holds a language from an earlier fetch.
    $storedPage = Page::factory()->createQuietly([
        'url' => 'https://example.com/article',
        'language' => 'en',
        'language_confidence' => 0.95,
    ]);
    $pageCrawl = PageCrawl::factory()->page($storedPage)->createQuietly();

    // Act: resolve the job from the container and run it.
    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $storedPage->fresh();

    // Assert: the language columns are sticky, so a null detection must not overwrite them.
    $this->assertSame('en', $reloaded->language);
    $this->assertEqualsWithDelta(0.95, $reloaded->language_confidence, 0.001);

    // Assert: stickiness is limited to language, so the other columns still update.
    $this->assertSame(PageStatusEnum::Fetched, $reloaded->status);
    $this->assertSame('Hello', $reloaded->title);
}
public function test_handle_leaves_language_null_when_no_prior_and_no_detection(): void
{
Queue::fake();