From 3297c4bb3bfecdbd5499dbe98f6a89adab75cc10 Mon Sep 17 00:00:00 2001
From: myrmidex
Date: Mon, 27 Apr 2026 00:18:34 +0200
Subject: [PATCH] 14 - Fix ProcessCrawlJob outcome write and status mapping bugs

---
 app/Jobs/ProcessCrawlJob.php               |  38 ++++--
 tests/Feature/Jobs/ProcessCrawlJobTest.php | 169 +++++++++++++++++++++
 2 files changed, 200 insertions(+), 7 deletions(-)

diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php
index a8b4513..11a2993 100644
--- a/app/Jobs/ProcessCrawlJob.php
+++ b/app/Jobs/ProcessCrawlJob.php
@@ -26,11 +26,7 @@ public function handle(
         /** @var FetchResult $result */
         $result = $fetcher($this->pageCrawl->page->url);
 
-        $this->pageCrawl->update([
-            'outcome' => CrawlOutcomeEnum::Success,
-            'completed_at' => now(),
-            'status_code' => 200,
-        ]);
+        $this->updatePageCrawl($result);
 
         $update = match ($result->outcome) {
             CrawlOutcomeEnum::Rejected => [
@@ -41,10 +37,19 @@ public function handle(
                 'status' => PageStatusEnum::Failed,
                 'failed_at' => now(),
             ],
+            CrawlOutcomeEnum::Failed => [
+                'status' => PageStatusEnum::Failed,
+            ],
             CrawlOutcomeEnum::Blocked4xx => [
                 'status' => PageStatusEnum::Failed,
                 'failed_at' => now(),
             ],
+            CrawlOutcomeEnum::Blocked5xx => [
+                'status' => PageStatusEnum::Failed,
+            ],
+            CrawlOutcomeEnum::BlockedRobots => [
+                'status' => PageStatusEnum::Failed,
+            ],
             default => [
                 'status' => PageStatusEnum::Fetched,
                 'fetched_at' => now(),
@@ -52,10 +57,12 @@ public function handle(
             ],
         };
 
-        $result->outboundLinks->each(fn (string $url) => $register($url));
-
         $this->pageCrawl->page->update($update);
 
+        if ($result->outcome !== CrawlOutcomeEnum::Failed) {
+            $result->outboundLinks->each(fn (string $url) => $register($url));
+        }
+
         if (in_array($result->outcome, [
             CrawlOutcomeEnum::Failed,
             CrawlOutcomeEnum::Timeout,
@@ -81,4 +88,21 @@ private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): v
 
         ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
     }
+
+    /**
+     * Persist the fetch result on the crawl row.
+     *
+     * The outcome, status code and error message are copied from the fetch
+     * result as-is, so a non-success outcome is never recorded as a
+     * successful 200 response.
+     */
+    public function updatePageCrawl(FetchResult $result): void
+    {
+        $this->pageCrawl->update([
+            'outcome' => $result->outcome,
+            'completed_at' => now(),
+            'status_code' => $result->statusCode,
+            'error_message' => $result->errorMessage,
+        ]);
+    }
 }
diff --git a/tests/Feature/Jobs/ProcessCrawlJobTest.php b/tests/Feature/Jobs/ProcessCrawlJobTest.php
index 8503089..dab484b 100644
--- a/tests/Feature/Jobs/ProcessCrawlJobTest.php
+++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php
@@ -266,6 +266,175 @@ public function test_handle_does_not_retry_after_three_attempts(): void
         Queue::assertNotPushed(ProcessCrawlJob::class);
     }
 
+    public function test_handle_writes_failed_outcome_to_page_crawl(): void
+    {
+        Queue::fake();
+
+        $fetcher = Mockery::mock(FetchPageAction::class);
+        $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult(
+            outcome: CrawlOutcomeEnum::Failed,
+            statusCode: null,
+            finalUrl: null,
+            title: null,
+            extractedText: null,
+            outboundLinks: collect(),
+            wordCount: null,
+            errorMessage: 'boom',
+        ));
+        $this->app->instance(FetchPageAction::class, $fetcher);
+
+        $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
+        $crawl = PageCrawl::factory()->page($page)->createQuietly();
+
+        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
+            ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
+
+        $this->assertDatabaseHas('page_crawls', [
+            'id' => $crawl->id,
+            'outcome' => CrawlOutcomeEnum::Failed->value,
+            'status_code' => null,
+            'error_message' => 'boom',
+        ]);
+    }
+
+    public function test_handle_writes_blocked_5xx_outcome_to_page_crawl(): void
+    {
+        Queue::fake();
+
+        $fetcher = Mockery::mock(FetchPageAction::class);
+        $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult(
+            outcome: CrawlOutcomeEnum::Blocked5xx,
+            statusCode: 503,
+            finalUrl: null,
+            title: null,
+            extractedText: null,
+            outboundLinks: collect(),
+            wordCount: null,
+            errorMessage: 'HTTP 503',
+        ));
+        $this->app->instance(FetchPageAction::class, $fetcher);
+
+        $page = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']);
+        $crawl = PageCrawl::factory()->page($page)->createQuietly();
+
+        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
+            ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
+
+        $this->assertDatabaseHas('page_crawls', [
+            'id' => $crawl->id,
+            'outcome' => CrawlOutcomeEnum::Blocked5xx->value,
+            'status_code' => 503,
+            'error_message' => 'HTTP 503',
+        ]);
+    }
+
+    public function test_handle_updates_page_to_failed_on_failed_outcome(): void
+    {
+        Queue::fake();
+
+        $fetcher = Mockery::mock(FetchPageAction::class);
+        $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult(
+            outcome: CrawlOutcomeEnum::Failed,
+            statusCode: null,
+            finalUrl: null,
+            title: null,
+            extractedText: null,
+            outboundLinks: collect(),
+            wordCount: null,
+            errorMessage: 'Connection refused',
+        ));
+        $this->app->instance(FetchPageAction::class, $fetcher);
+
+        $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);
+        $crawl = PageCrawl::factory()->page($page)->createQuietly();
+
+        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
+            ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
+
+        $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
+    }
+
+    public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
+    {
+        Queue::fake();
+
+        $fetcher = Mockery::mock(FetchPageAction::class);
+        $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult(
+            outcome: CrawlOutcomeEnum::Blocked5xx,
+            statusCode: 503,
+            finalUrl: null,
+            title: null,
+            extractedText: null,
+            outboundLinks: collect(),
+            wordCount: null,
+            errorMessage: 'HTTP 503',
+        ));
+        $this->app->instance(FetchPageAction::class, $fetcher);
+
+        $page = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']);
+        $crawl = PageCrawl::factory()->page($page)->createQuietly();
+
+        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
+            ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
+
+        $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
+    }
+
+    public function test_handle_updates_page_to_failed_on_blocked_robots(): void
+    {
+        Queue::fake();
+
+        $fetcher = Mockery::mock(FetchPageAction::class);
+        $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult(
+            outcome: CrawlOutcomeEnum::BlockedRobots,
+            statusCode: null,
+            finalUrl: null,
+            title: null,
+            extractedText: null,
+            outboundLinks: collect(),
+            wordCount: null,
+            errorMessage: 'Disallowed by robots.txt',
+        ));
+        $this->app->instance(FetchPageAction::class, $fetcher);
+
+        $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
+        $crawl = PageCrawl::factory()->page($page)->createQuietly();
+
+        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
+            ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
+
+        $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
+    }
+
+    public function test_handle_does_not_register_outbound_links_on_failure(): void
+    {
+        Queue::fake();
+
+        $fetcher = Mockery::mock(FetchPageAction::class);
+        $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult(
+            outcome: CrawlOutcomeEnum::Failed,
+            statusCode: null,
+            finalUrl: null,
+            title: null,
+            extractedText: null,
+            outboundLinks: collect([
+                'https://should-not-be-registered.com/page',
+            ]),
+            wordCount: null,
+            errorMessage: 'Connection refused',
+        ));
+        $this->app->instance(FetchPageAction::class, $fetcher);
+
+        $page = Page::factory()->createQuietly(['url' => 'https://example.com/broken']);
+        $crawl = PageCrawl::factory()->page($page)->createQuietly();
+
+        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
+            ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
+
+        $this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']);
+        $this->assertSame(1, Page::count());
+    }
+
     public function test_handle_registers_outbound_links_on_success(): void
     {
         Queue::fake();