13 - Make page language sticky across re-crawls when new fetch returns null
This commit is contained in:
parent
1cba8f3fc9
commit
a59c086da2
2 changed files with 45 additions and 3 deletions
|
|
@@ -84,8 +84,14 @@ private function updatePageStatus(FetchResult $result): void
|
|||
'status' => $status,
|
||||
'fetched_at' => now(),
|
||||
'title' => $result->title,
|
||||
'language' => $result->language,
|
||||
'language_confidence' => $result->languageConfidence,
|
||||
// Sticky language: only write when detection produced a value, so a re-crawl
|
||||
// returning null doesn't erase a previously-detected language. Guarding on
|
||||
// language alone is sufficient because FetchPageAction::detectLanguage()
|
||||
// always returns the pair as both-null or both-non-null (never mixed).
|
||||
...($result->language !== null ? [
|
||||
'language' => $result->language,
|
||||
'language_confidence' => $result->languageConfidence,
|
||||
] : []),
|
||||
],
|
||||
PageStatusEnum::Failed => [
|
||||
'status' => $status,
|
||||
|
|
|
|||
|
|
@@ -482,7 +482,43 @@ public function test_handle_persists_language_on_success(): void
|
|||
$this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001);
|
||||
}
|
||||
|
||||
public function test_handle_persists_null_language_on_success(): void
|
||||
public function test_handle_does_not_overwrite_existing_language_when_new_fetch_returns_null(): void
|
||||
{
|
||||
Queue::fake();
|
||||
|
||||
$this->mockFetchPageAction(
|
||||
CrawlOutcomeEnum::Success,
|
||||
statusCode: 200,
|
||||
title: 'Hello',
|
||||
extractedText: 'hi',
|
||||
wordCount: 1,
|
||||
language: null,
|
||||
languageConfidence: null,
|
||||
);
|
||||
|
||||
// Page already has a language from a previous fetch
|
||||
$page = Page::factory()->createQuietly([
|
||||
'url' => 'https://example.com/article',
|
||||
'language' => 'en',
|
||||
'language_confidence' => 0.95,
|
||||
]);
|
||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||
|
||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||
->handle();
|
||||
|
||||
$fresh = $page->fresh();
|
||||
|
||||
// Language columns must be sticky — null detection must NOT overwrite them
|
||||
$this->assertSame('en', $fresh->language);
|
||||
$this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001);
|
||||
|
||||
// Other columns must still update — sticky applies to language only
|
||||
$this->assertSame(PageStatusEnum::Fetched, $fresh->status);
|
||||
$this->assertSame('Hello', $fresh->title);
|
||||
}
|
||||
|
||||
public function test_handle_leaves_language_null_when_no_prior_and_no_detection(): void
|
||||
{
|
||||
Queue::fake();
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue