14 - Fix ProcessCrawlJob outcome write and status mapping bugs
This commit is contained in:
parent
e8a935ea31
commit
3297c4bb3b
2 changed files with 172 additions and 7 deletions
|
|
@ -26,11 +26,7 @@ public function handle(
|
|||
/** @var FetchResult $result */
|
||||
$result = $fetcher($this->pageCrawl->page->url);
|
||||
|
||||
$this->pageCrawl->update([
|
||||
'outcome' => CrawlOutcomeEnum::Success,
|
||||
'completed_at' => now(),
|
||||
'status_code' => 200,
|
||||
]);
|
||||
$this->updatePageCrawl($result);
|
||||
|
||||
$update = match ($result->outcome) {
|
||||
CrawlOutcomeEnum::Rejected => [
|
||||
|
|
@ -41,10 +37,19 @@ public function handle(
|
|||
'status' => PageStatusEnum::Failed,
|
||||
'failed_at' => now(),
|
||||
],
|
||||
CrawlOutcomeEnum::Failed => [
|
||||
'status' => PageStatusEnum::Failed,
|
||||
],
|
||||
CrawlOutcomeEnum::Blocked4xx => [
|
||||
'status' => PageStatusEnum::Failed,
|
||||
'failed_at' => now(),
|
||||
],
|
||||
CrawlOutcomeEnum::Blocked5xx => [
|
||||
'status' => PageStatusEnum::Failed,
|
||||
],
|
||||
CrawlOutcomeEnum::BlockedRobots => [
|
||||
'status' => PageStatusEnum::Failed,
|
||||
],
|
||||
default => [
|
||||
'status' => PageStatusEnum::Fetched,
|
||||
'fetched_at' => now(),
|
||||
|
|
@ -52,10 +57,12 @@ public function handle(
|
|||
],
|
||||
};
|
||||
|
||||
$result->outboundLinks->each(fn (string $url) => $register($url));
|
||||
|
||||
$this->pageCrawl->page->update($update);
|
||||
|
||||
if ($result->outcome !== CrawlOutcomeEnum::Failed) {
|
||||
$result->outboundLinks->each(fn (string $url) => $register($url));
|
||||
}
|
||||
|
||||
if (in_array($result->outcome, [
|
||||
CrawlOutcomeEnum::Failed,
|
||||
CrawlOutcomeEnum::Timeout,
|
||||
|
|
@ -81,4 +88,24 @@ private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): v
|
|||
|
||||
ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
|
||||
}
|
||||
|
||||
/**
 * Persist the result of a fetch onto this job's PageCrawl row.
 *
 * The outcome, status code and error message are taken from the fetch
 * result itself. Previously only the Failed outcome was recognised and
 * every other outcome (blocked, rejected, timed out, ...) was stored as
 * Success with status code 200 and no error message, which made the
 * crawl row disagree with what the fetcher reported.
 *
 * NOTE(review): assumes FetchResult exposes `statusCode` as a readable
 * property, like `outcome` and `errorMessage` — confirm against the class.
 *
 * @param FetchResult $result The result returned by the fetch action.
 */
public function updatePageCrawl(FetchResult $result): void
{
    $this->pageCrawl->update([
        'outcome' => $result->outcome,
        'completed_at' => now(),
        'status_code' => $result->statusCode,
        'error_message' => $result->errorMessage,
    ]);
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -266,6 +266,144 @@ public function test_handle_does_not_retry_after_three_attempts(): void
|
|||
Queue::assertNotPushed(ProcessCrawlJob::class);
|
||||
}
|
||||
|
||||
/**
 * When the fetcher reports the Failed outcome, the page_crawls row must be
 * stored as Failed with a null status code and the fetcher's error message.
 */
public function test_handle_writes_failed_outcome_to_page_crawl(): void
{
    Queue::fake();

    // Stub the fetch action so it always yields a failed result.
    $failedResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'boom',
    );
    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($failedResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $targetPage = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
    $pageCrawl = PageCrawl::factory()->page($targetPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $expectedRow = [
        'id' => $pageCrawl->id,
        'outcome' => CrawlOutcomeEnum::Failed->value,
        'status_code' => null,
        'error_message' => 'boom',
    ];
    $this->assertDatabaseHas('page_crawls', $expectedRow);
}
|
||||
|
||||
/**
 * A Failed fetch outcome must leave the crawled page in the Failed status.
 */
public function test_handle_updates_page_to_failed_on_failed_outcome(): void
{
    Queue::fake();

    // Stub the fetch action so it always yields a failed result.
    $failedResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Connection refused',
    );
    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($failedResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $targetPage = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);
    $pageCrawl = PageCrawl::factory()->page($targetPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    // Re-read the page from the database to observe what the job persisted.
    $reloadedPage = $targetPage->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloadedPage->status);
}
|
||||
|
||||
/**
 * A Blocked5xx fetch outcome (here a 503) must leave the crawled page in
 * the Failed status.
 */
public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
{
    Queue::fake();

    // Stub the fetch action so it always reports a 5xx block.
    $blockedResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Blocked5xx,
        statusCode: 503,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'HTTP 503',
    );
    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($blockedResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $targetPage = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']);
    $pageCrawl = PageCrawl::factory()->page($targetPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    // Re-read the page from the database to observe what the job persisted.
    $reloadedPage = $targetPage->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloadedPage->status);
}
|
||||
|
||||
/**
 * A BlockedRobots fetch outcome must leave the crawled page in the Failed
 * status.
 */
public function test_handle_updates_page_to_failed_on_blocked_robots(): void
{
    Queue::fake();

    // Stub the fetch action so it always reports a robots.txt block.
    $robotsResult = new FetchResult(
        outcome: CrawlOutcomeEnum::BlockedRobots,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Disallowed by robots.txt',
    );
    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($robotsResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $targetPage = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
    $pageCrawl = PageCrawl::factory()->page($targetPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    // Re-read the page from the database to observe what the job persisted.
    $reloadedPage = $targetPage->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloadedPage->status);
}
|
||||
|
||||
/**
 * Outbound links carried by a Failed fetch result must not be registered:
 * the linked URL must be absent from the pages table and the crawled page
 * must remain the only Page row.
 */
public function test_handle_does_not_register_outbound_links_on_failure(): void
{
    Queue::fake();

    // A failed result that nevertheless carries one outbound link.
    $discoveredLinks = collect([
        'https://should-not-be-registered.com/page',
    ]);
    $failedResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: $discoveredLinks,
        wordCount: null,
        errorMessage: 'Connection refused',
    );
    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($failedResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $targetPage = Page::factory()->createQuietly(['url' => 'https://example.com/broken']);
    $pageCrawl = PageCrawl::factory()->page($targetPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']);
    $this->assertSame(1, Page::count());
}
|
||||
|
||||
public function test_handle_registers_outbound_links_on_success(): void
|
||||
{
|
||||
Queue::fake();
|
||||
|
|
|
|||
Loading…
Reference in a new issue