From a59c086da2b2920e73db0a3b31c8f1885dcdef4e Mon Sep 17 00:00:00 2001 From: myrmidex Date: Tue, 28 Apr 2026 01:32:38 +0200 Subject: [PATCH] 13 - Make page language sticky across re-crawls when new fetch returns null --- app/Jobs/ProcessCrawlJob.php | 10 ++++-- tests/Feature/Jobs/ProcessCrawlJobTest.php | 38 +++++++++++++++++++++- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php index 071bd49..2c15b0c 100644 --- a/app/Jobs/ProcessCrawlJob.php +++ b/app/Jobs/ProcessCrawlJob.php @@ -84,8 +84,14 @@ private function updatePageStatus(FetchResult $result): void 'status' => $status, 'fetched_at' => now(), 'title' => $result->title, - 'language' => $result->language, - 'language_confidence' => $result->languageConfidence, + // Sticky language: only write when detection produced a value, so a re-crawl + // returning null doesn't erase a previously-detected language. Guarding on + // language alone is sufficient because FetchPageAction::detectLanguage() + // always returns the pair as both-null or both-non-null (never mixed). + ...($result->language !== null ? [ + 'language' => $result->language, + 'language_confidence' => $result->languageConfidence, + ] : []), ], PageStatusEnum::Failed => [ 'status' => $status, diff --git a/tests/Feature/Jobs/ProcessCrawlJobTest.php b/tests/Feature/Jobs/ProcessCrawlJobTest.php index bf353e6..4f07f80 100644 --- a/tests/Feature/Jobs/ProcessCrawlJobTest.php +++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php @@ -482,7 +482,43 @@ public function test_handle_persists_language_on_success(): void $this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001); } - public function test_handle_persists_null_language_on_success(): void + public function test_handle_does_not_overwrite_existing_language_when_new_fetch_returns_null(): void + { + Queue::fake(); + + $this->mockFetchPageAction( + CrawlOutcomeEnum::Success, + statusCode: 200, + title: 'Hello', + extractedText: 'hi', + wordCount: 1, + language: null, + languageConfidence: null, + ); + + // Page already has a language from a previous fetch + $page = Page::factory()->createQuietly([ + 'url' => 'https://example.com/article', + 'language' => 'en', + 'language_confidence' => 0.95, + ]); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $fresh = $page->fresh(); + + // Language columns must be sticky — null detection must NOT overwrite them + $this->assertSame('en', $fresh->language); + $this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001); + + // Other columns must still update — sticky applies to language only + $this->assertSame(PageStatusEnum::Fetched, $fresh->status); + $this->assertSame('Hello', $fresh->title); + } + + public function test_handle_leaves_language_null_when_no_prior_and_no_detection(): void { Queue::fake();