13 - Make page language sticky across re-crawls when new fetch returns null
This commit is contained in:
parent
1cba8f3fc9
commit
a59c086da2
2 changed files with 45 additions and 3 deletions
|
|
@ -84,8 +84,14 @@ private function updatePageStatus(FetchResult $result): void
|
||||||
'status' => $status,
|
'status' => $status,
|
||||||
'fetched_at' => now(),
|
'fetched_at' => now(),
|
||||||
'title' => $result->title,
|
'title' => $result->title,
|
||||||
'language' => $result->language,
|
// Sticky language: only write when detection produced a value, so a re-crawl
|
||||||
'language_confidence' => $result->languageConfidence,
|
// returning null doesn't erase a previously-detected language. Guarding on
|
||||||
|
// language alone is sufficient because FetchPageAction::detectLanguage()
|
||||||
|
// always returns the pair as both-null or both-non-null (never mixed).
|
||||||
|
...($result->language !== null ? [
|
||||||
|
'language' => $result->language,
|
||||||
|
'language_confidence' => $result->languageConfidence,
|
||||||
|
] : []),
|
||||||
],
|
],
|
||||||
PageStatusEnum::Failed => [
|
PageStatusEnum::Failed => [
|
||||||
'status' => $status,
|
'status' => $status,
|
||||||
|
|
|
||||||
|
|
@ -482,7 +482,43 @@ public function test_handle_persists_language_on_success(): void
|
||||||
$this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001);
|
$this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001);
|
||||||
}
|
}
|
||||||
|
|
||||||
public function test_handle_persists_null_language_on_success(): void
|
public function test_handle_does_not_overwrite_existing_language_when_new_fetch_returns_null(): void
|
||||||
|
{
|
||||||
|
Queue::fake();
|
||||||
|
|
||||||
|
$this->mockFetchPageAction(
|
||||||
|
CrawlOutcomeEnum::Success,
|
||||||
|
statusCode: 200,
|
||||||
|
title: 'Hello',
|
||||||
|
extractedText: 'hi',
|
||||||
|
wordCount: 1,
|
||||||
|
language: null,
|
||||||
|
languageConfidence: null,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Page already has a language from a previous fetch
|
||||||
|
$page = Page::factory()->createQuietly([
|
||||||
|
'url' => 'https://example.com/article',
|
||||||
|
'language' => 'en',
|
||||||
|
'language_confidence' => 0.95,
|
||||||
|
]);
|
||||||
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
|
->handle();
|
||||||
|
|
||||||
|
$fresh = $page->fresh();
|
||||||
|
|
||||||
|
// Language columns must be sticky — null detection must NOT overwrite them
|
||||||
|
$this->assertSame('en', $fresh->language);
|
||||||
|
$this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001);
|
||||||
|
|
||||||
|
// Other columns must still update — sticky applies to language only
|
||||||
|
$this->assertSame(PageStatusEnum::Fetched, $fresh->status);
|
||||||
|
$this->assertSame('Hello', $fresh->title);
|
||||||
|
}
|
||||||
|
|
||||||
|
public function test_handle_leaves_language_null_when_no_prior_and_no_detection(): void
|
||||||
{
|
{
|
||||||
Queue::fake();
|
Queue::fake();
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue