14 - Fix ProcessCrawlJob outcome write and status mapping bugs
This commit is contained in:
parent
e8a935ea31
commit
3297c4bb3b
2 changed files with 172 additions and 7 deletions
|
|
@ -26,11 +26,7 @@ public function handle(
|
||||||
/** @var FetchResult $result */
|
/** @var FetchResult $result */
|
||||||
$result = $fetcher($this->pageCrawl->page->url);
|
$result = $fetcher($this->pageCrawl->page->url);
|
||||||
|
|
||||||
$this->pageCrawl->update([
|
$this->updatePageCrawl($result);
|
||||||
'outcome' => CrawlOutcomeEnum::Success,
|
|
||||||
'completed_at' => now(),
|
|
||||||
'status_code' => 200,
|
|
||||||
]);
|
|
||||||
|
|
||||||
$update = match ($result->outcome) {
|
$update = match ($result->outcome) {
|
||||||
CrawlOutcomeEnum::Rejected => [
|
CrawlOutcomeEnum::Rejected => [
|
||||||
|
|
@ -41,10 +37,19 @@ public function handle(
|
||||||
'status' => PageStatusEnum::Failed,
|
'status' => PageStatusEnum::Failed,
|
||||||
'failed_at' => now(),
|
'failed_at' => now(),
|
||||||
],
|
],
|
||||||
|
CrawlOutcomeEnum::Failed => [
|
||||||
|
'status' => PageStatusEnum::Failed,
|
||||||
|
],
|
||||||
CrawlOutcomeEnum::Blocked4xx => [
|
CrawlOutcomeEnum::Blocked4xx => [
|
||||||
'status' => PageStatusEnum::Failed,
|
'status' => PageStatusEnum::Failed,
|
||||||
'failed_at' => now(),
|
'failed_at' => now(),
|
||||||
],
|
],
|
||||||
|
CrawlOutcomeEnum::Blocked5xx => [
|
||||||
|
'status' => PageStatusEnum::Failed,
|
||||||
|
],
|
||||||
|
CrawlOutcomeEnum::BlockedRobots => [
|
||||||
|
'status' => PageStatusEnum::Failed,
|
||||||
|
],
|
||||||
default => [
|
default => [
|
||||||
'status' => PageStatusEnum::Fetched,
|
'status' => PageStatusEnum::Fetched,
|
||||||
'fetched_at' => now(),
|
'fetched_at' => now(),
|
||||||
|
|
@ -52,10 +57,12 @@ public function handle(
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
|
|
||||||
$result->outboundLinks->each(fn (string $url) => $register($url));
|
|
||||||
|
|
||||||
$this->pageCrawl->page->update($update);
|
$this->pageCrawl->page->update($update);
|
||||||
|
|
||||||
|
if ($result->outcome !== CrawlOutcomeEnum::Failed) {
|
||||||
|
$result->outboundLinks->each(fn (string $url) => $register($url));
|
||||||
|
}
|
||||||
|
|
||||||
if (in_array($result->outcome, [
|
if (in_array($result->outcome, [
|
||||||
CrawlOutcomeEnum::Failed,
|
CrawlOutcomeEnum::Failed,
|
||||||
CrawlOutcomeEnum::Timeout,
|
CrawlOutcomeEnum::Timeout,
|
||||||
|
|
@ -81,4 +88,24 @@ private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): v
|
||||||
|
|
||||||
ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
|
ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Persist the result of a fetch attempt on this job's PageCrawl row.
 *
 * The outcome, status code and error message are taken from the fetch result
 * itself. Previously every outcome other than Failed (for example Blocked5xx
 * with a 503, or BlockedRobots) was stored as a Success with status code 200.
 *
 * @param  FetchResult  $result  Result returned by the fetcher for this crawl.
 */
public function updatePageCrawl(FetchResult $result): void
{
    $succeeded = $result->outcome === CrawlOutcomeEnum::Success;

    $this->pageCrawl->update([
        'outcome' => $result->outcome,
        'completed_at' => now(),
        // Keep the historical 200 default for a success that carries no status code;
        // non-success outcomes store whatever the fetcher reported (possibly null).
        'status_code' => $result->statusCode ?? ($succeeded ? 200 : null),
        // Successful crawls never stored an error message; keep that invariant.
        'error_message' => $succeeded ? null : $result->errorMessage,
    ]);
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -266,6 +266,144 @@ public function test_handle_does_not_retry_after_three_attempts(): void
|
||||||
Queue::assertNotPushed(ProcessCrawlJob::class);
|
Queue::assertNotPushed(ProcessCrawlJob::class);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * A Failed fetch result must be written to the page_crawls row with a null
 * status code and the error message reported by the fetcher.
 */
public function test_handle_writes_failed_outcome_to_page_crawl(): void
{
    Queue::fake();

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    // Stub the fetcher so every invocation reports a hard failure.
    $failedFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'boom',
    );
    $fetcherStub = Mockery::mock(FetchPageAction::class);
    $fetcherStub->shouldReceive('__invoke')->andReturn($failedFetch);
    $this->app->instance(FetchPageAction::class, $fetcherStub);

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $expectedRow = [
        'id' => $crawl->id,
        'outcome' => CrawlOutcomeEnum::Failed->value,
        'status_code' => null,
        'error_message' => 'boom',
    ];
    $this->assertDatabaseHas('page_crawls', $expectedRow);
}
|
||||||
|
|
||||||
|
/**
 * When the fetcher reports a Failed outcome, the page itself must end up
 * with the Failed status.
 */
public function test_handle_updates_page_to_failed_on_failed_outcome(): void
{
    Queue::fake();

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    // Stub the fetcher with a connection-level failure.
    $failedFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Connection refused',
    );
    $fetcherStub = Mockery::mock(FetchPageAction::class);
    $fetcherStub->shouldReceive('__invoke')->andReturn($failedFetch);
    $this->app->instance(FetchPageAction::class, $fetcherStub);

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $reloadedPage = $page->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloadedPage->status);
}
|
||||||
|
|
||||||
|
/**
 * A Blocked5xx outcome (here a 503 response) must mark the page as Failed.
 */
public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
{
    Queue::fake();

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    // Stub the fetcher with a server-side block carrying a 503 status code.
    $blockedFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::Blocked5xx,
        statusCode: 503,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'HTTP 503',
    );
    $fetcherStub = Mockery::mock(FetchPageAction::class);
    $fetcherStub->shouldReceive('__invoke')->andReturn($blockedFetch);
    $this->app->instance(FetchPageAction::class, $fetcherStub);

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $reloadedPage = $page->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloadedPage->status);
}
|
||||||
|
|
||||||
|
/**
 * A BlockedRobots outcome must mark the page as Failed.
 */
public function test_handle_updates_page_to_failed_on_blocked_robots(): void
{
    Queue::fake();

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    // Stub the fetcher with a robots.txt refusal; no status code is reported.
    $blockedFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::BlockedRobots,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Disallowed by robots.txt',
    );
    $fetcherStub = Mockery::mock(FetchPageAction::class);
    $fetcherStub->shouldReceive('__invoke')->andReturn($blockedFetch);
    $this->app->instance(FetchPageAction::class, $fetcherStub);

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $reloadedPage = $page->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloadedPage->status);
}
|
||||||
|
|
||||||
|
/**
 * Outbound links attached to a Failed fetch result must not be registered:
 * no page row is created for the link and the crawled page stays the only
 * row in the pages table.
 */
public function test_handle_does_not_register_outbound_links_on_failure(): void
{
    Queue::fake();

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/broken']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    // Stub the fetcher with a failure that nevertheless carries one outbound link.
    $unwantedUrl = 'https://should-not-be-registered.com/page';
    $failedFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect([$unwantedUrl]),
        wordCount: null,
        errorMessage: 'Connection refused',
    );
    $fetcherStub = Mockery::mock(FetchPageAction::class);
    $fetcherStub->shouldReceive('__invoke')->andReturn($failedFetch);
    $this->app->instance(FetchPageAction::class, $fetcherStub);

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $this->assertDatabaseMissing('pages', ['url' => $unwantedUrl]);
    $this->assertSame(1, Page::count());
}
|
||||||
|
|
||||||
public function test_handle_registers_outbound_links_on_success(): void
|
public function test_handle_registers_outbound_links_on_success(): void
|
||||||
{
|
{
|
||||||
Queue::fake();
|
Queue::fake();
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue