14 - Fix ProcessCrawlJob outcome write and status mapping bugs

This commit is contained in:
myrmidex 2026-04-27 00:18:34 +02:00
parent e8a935ea31
commit 3297c4bb3b
2 changed files with 172 additions and 7 deletions

View file

@ -26,11 +26,7 @@ public function handle(
/** @var FetchResult $result */
$result = $fetcher($this->pageCrawl->page->url);
$this->pageCrawl->update([
'outcome' => CrawlOutcomeEnum::Success,
'completed_at' => now(),
'status_code' => 200,
]);
$this->updatePageCrawl($result);
$update = match ($result->outcome) {
CrawlOutcomeEnum::Rejected => [
@ -41,10 +37,19 @@ public function handle(
'status' => PageStatusEnum::Failed,
'failed_at' => now(),
],
CrawlOutcomeEnum::Failed => [
'status' => PageStatusEnum::Failed,
],
CrawlOutcomeEnum::Blocked4xx => [
'status' => PageStatusEnum::Failed,
'failed_at' => now(),
],
CrawlOutcomeEnum::Blocked5xx => [
'status' => PageStatusEnum::Failed,
],
CrawlOutcomeEnum::BlockedRobots => [
'status' => PageStatusEnum::Failed,
],
default => [
'status' => PageStatusEnum::Fetched,
'fetched_at' => now(),
@ -52,10 +57,12 @@ public function handle(
],
};
$result->outboundLinks->each(fn (string $url) => $register($url));
$this->pageCrawl->page->update($update);
if ($result->outcome !== CrawlOutcomeEnum::Failed) {
$result->outboundLinks->each(fn (string $url) => $register($url));
}
if (in_array($result->outcome, [
CrawlOutcomeEnum::Failed,
CrawlOutcomeEnum::Timeout,
@ -81,4 +88,24 @@ private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): v
ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
}
/**
 * Record the result of a fetch attempt on this job's PageCrawl row.
 *
 * The outcome, HTTP status code and error message are taken from the
 * fetch result itself rather than being assumed, so blocked, timed-out
 * or rejected fetches are not stored as a successful 200 response.
 *
 * @param FetchResult $result Result returned by the fetcher for this crawl.
 */
public function updatePageCrawl(FetchResult $result): void
{
    // A Failed result has no HTTP response, so never store a status code for it,
    // even if the fetcher happened to populate one.
    $statusCode = $result->outcome === CrawlOutcomeEnum::Failed
        ? null
        : $result->statusCode;

    $this->pageCrawl->update([
        'outcome' => $result->outcome,
        'completed_at' => now(),
        'status_code' => $statusCode,
        'error_message' => $result->errorMessage,
    ]);
}
}

View file

@ -266,6 +266,144 @@ public function test_handle_does_not_retry_after_three_attempts(): void
Queue::assertNotPushed(ProcessCrawlJob::class);
}
/**
 * A Failed fetch result must be written to the page_crawls row: the Failed
 * outcome value, a null status code, and the error message from the result.
 */
public function test_handle_writes_failed_outcome_to_page_crawl(): void
{
    Queue::fake();

    $failedResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'boom',
    );

    // Bind a stub fetcher in the container that always returns the failed result.
    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($failedResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $targetPage = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
    $pageCrawl = PageCrawl::factory()->page($targetPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $expectedRow = [
        'id' => $pageCrawl->id,
        'outcome' => CrawlOutcomeEnum::Failed->value,
        'status_code' => null,
        'error_message' => 'boom',
    ];
    $this->assertDatabaseHas('page_crawls', $expectedRow);
}
/**
 * When the fetcher reports a Failed outcome, the crawled page itself must
 * end up with the Failed status.
 */
public function test_handle_updates_page_to_failed_on_failed_outcome(): void
{
    Queue::fake();

    $failedResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Connection refused',
    );

    // Bind a stub fetcher in the container that always returns the failed result.
    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($failedResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $targetPage = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);
    $pageCrawl = PageCrawl::factory()->page($targetPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    // Re-read the page from the database so the assertion sees what the job persisted.
    $reloadedPage = $targetPage->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloadedPage->status);
}
/**
 * A Blocked5xx outcome (here an HTTP 503) must leave the page with the
 * Failed status.
 */
public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
{
    Queue::fake();

    $serverErrorResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Blocked5xx,
        statusCode: 503,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'HTTP 503',
    );

    // Bind a stub fetcher in the container that always returns the 503 result.
    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($serverErrorResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $targetPage = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']);
    $pageCrawl = PageCrawl::factory()->page($targetPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    // Re-read the page from the database so the assertion sees what the job persisted.
    $reloadedPage = $targetPage->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloadedPage->status);
}
/**
 * A BlockedRobots outcome must leave the page with the Failed status.
 */
public function test_handle_updates_page_to_failed_on_blocked_robots(): void
{
    Queue::fake();

    $robotsBlockedResult = new FetchResult(
        outcome: CrawlOutcomeEnum::BlockedRobots,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Disallowed by robots.txt',
    );

    // Bind a stub fetcher in the container that always returns the robots-blocked result.
    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($robotsBlockedResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $targetPage = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
    $pageCrawl = PageCrawl::factory()->page($targetPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    // Re-read the page from the database so the assertion sees what the job persisted.
    $reloadedPage = $targetPage->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloadedPage->status);
}
/**
 * Outbound links carried by a Failed fetch result must not be registered:
 * no row is created for the link and the crawled page stays the only page.
 */
public function test_handle_does_not_register_outbound_links_on_failure(): void
{
    Queue::fake();

    $unwantedLink = 'https://should-not-be-registered.com/page';

    $failedResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect([$unwantedLink]),
        wordCount: null,
        errorMessage: 'Connection refused',
    );

    // Bind a stub fetcher in the container that always returns the failed result.
    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($failedResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $targetPage = Page::factory()->createQuietly(['url' => 'https://example.com/broken']);
    $pageCrawl = PageCrawl::factory()->page($targetPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $this->assertDatabaseMissing('pages', ['url' => $unwantedLink]);

    // Only the page created above may exist.
    $pageTotal = Page::count();
    $this->assertSame(1, $pageTotal);
}
public function test_handle_registers_outbound_links_on_success(): void
{
Queue::fake();