chore - Move outcome → status mapping into CrawlOutcomeEnum methods
Some checks failed
CI / ci (push) Failing after 3h0m1s

This commit is contained in:
myrmidex 2026-04-27 01:36:37 +02:00
parent 1538ceeb6e
commit 264180cd36
3 changed files with 119 additions and 62 deletions

View file

@ -20,4 +20,41 @@ enum CrawlOutcomeEnum: string
* prevent re-discovery loops as fediverse re-shares the URL.
*/
case Rejected = 'rejected';
/**
* The PageStatusEnum value the parent `pages` row should land on for this outcome.
*/
public function toPageStatus(): PageStatusEnum
{
    // No default arm on purpose: an outcome without an explicit mapping raises
    // UnhandledMatchError instead of silently inheriting a status.
    $pageStatus = match ($this) {
        // Every failure flavour collapses to the same page status.
        self::Failed, self::Timeout, self::BlockedRobots, self::Blocked4xx, self::Blocked5xx
            => PageStatusEnum::Failed,
        self::Rejected => PageStatusEnum::Rejected,
        self::Success => PageStatusEnum::Fetched,
    };

    return $pageStatus;
}
/**
* True if the worker should retry this outcome (transient failures only).
* Permanent failures (4xx, robots block, rejected content type) and successes do not retry.
*/
public function isRetryable(): bool
{
    // Each case is listed explicitly (no default arm), so an unclassified
    // outcome fails loudly rather than being treated as final or transient.
    return match ($this) {
        self::Success,
        self::Rejected,
        self::BlockedRobots,
        self::Blocked4xx => false,
        self::Failed,
        self::Timeout,
        self::Blocked5xx => true,
    };
}
/**
* True if the worker should register the outbound links discovered during the fetch.
* Only Success outcomes have meaningful links; everything else either failed or returned no usable HTML.
*/
public function shouldRegisterOutboundLinks(): bool
{
    // Success is the single outcome for which links are registered.
    return match ($this) {
        self::Success => true,
        default => false,
    };
}
}

View file

@ -1,10 +1,11 @@
<?php
declare(strict_types=1);
namespace App\Jobs;
use App\Actions\FetchPageAction;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use App\Models\PageCrawl;
use App\Services\PolitenessService;
@ -36,64 +37,64 @@ public function handle(): void
return;
}
/** @var FetchResult $result */
$result = $fetcher($this->pageCrawl->page->url);
$this->updatePageCrawl($result);
$this->writeOutcome($result);
$this->updatePageStatus($result);
$update = match ($result->outcome) {
CrawlOutcomeEnum::Rejected => [
'status' => PageStatusEnum::Rejected,
'fetched_at' => null,
],
CrawlOutcomeEnum::Timeout => [
'status' => PageStatusEnum::Failed,
'failed_at' => now(),
],
CrawlOutcomeEnum::Failed => [
'status' => PageStatusEnum::Failed,
],
CrawlOutcomeEnum::Blocked4xx => [
'status' => PageStatusEnum::Failed,
'failed_at' => now(),
],
CrawlOutcomeEnum::Blocked5xx => [
'status' => PageStatusEnum::Failed,
],
CrawlOutcomeEnum::BlockedRobots => [
'status' => PageStatusEnum::Failed,
],
default => [
'status' => PageStatusEnum::Fetched,
if ($result->outcome->shouldRegisterOutboundLinks()) {
$result->outboundLinks->each(fn (string $url) => $register($url));
}
if ($result->outcome->isRetryable()) {
$this->scheduleRetryIfNeeded();
}
}
private function writeOutcome(FetchResult $result): void
{
    // Copy the fetch result onto the crawl row as-is and stamp the completion time.
    $attributes = [
        'outcome' => $result->outcome,
        'completed_at' => now(),
        'status_code' => $result->statusCode,
        'error_message' => $result->errorMessage,
    ];

    $this->pageCrawl->update($attributes);
}
/**
 * Updates the parent page row according to the crawl outcome.
 *
 * The target status is derived from the outcome via toPageStatus(); the extra
 * columns written alongside it depend on that status.
 */
private function updatePageStatus(FetchResult $result): void
{
$status = $result->outcome->toPageStatus();
$update = match ($status) {
// Fetched: record when the fetch happened and the title taken from the result.
PageStatusEnum::Fetched => [
'status' => $status,
'fetched_at' => now(),
'title' => $result->title,
],
// Failed: record the failure time; applies to every outcome that maps to Failed.
PageStatusEnum::Failed => [
'status' => $status,
'failed_at' => now(),
],
// Rejected: only the status changes.
PageStatusEnum::Rejected => [
'status' => $status,
],
// NOTE(review): toPageStatus() as shown in this change never returns Discovered;
// this arm presumably exists only to keep the match exhaustive — confirm.
PageStatusEnum::Discovered => [
'status' => $status,
],
};
$this->pageCrawl->page->update($update);
// NOTE(review): the statements below use $register, which is not defined in this method,
// and pass arguments to scheduleRetryIfNeeded(). They look like pre-refactor lines from
// handle() carried along by the diff rendering — confirm whether they belong here.
if ($result->outcome !== CrawlOutcomeEnum::Failed) {
$result->outboundLinks->each(fn (string $url) => $register($url));
}
if (in_array($result->outcome, [
CrawlOutcomeEnum::Failed,
CrawlOutcomeEnum::Timeout,
CrawlOutcomeEnum::Blocked5xx,
])) {
$this->scheduleRetryIfNeeded($result, $this->pageCrawl);
}
}
private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): void
private function scheduleRetryIfNeeded(): void
{
if (PageCrawl::where('page_id', $crawl->page_id)->count() >= 3) {
if (PageCrawl::where('page_id', $this->pageCrawl->page_id)->count() >= 3) {
return;
}
$newRow = PageCrawl::withoutEvents(
fn () => PageCrawl::create(
array_merge($crawl->toArray(), [
array_merge($this->pageCrawl->toArray(), [
'outcome' => null,
])
)
@ -101,24 +102,4 @@ private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): v
ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
}
/**
 * Records the result of a finished fetch on the crawl row.
 *
 * The outcome, status code and error message are stored exactly as reported
 * by the fetch result, together with the completion time.
 */
public function updatePageCrawl(FetchResult $result): void
{
    // Persist the reported values directly. Collapsing everything except Failed
    // into Success/200 would misrecord Timeout, Rejected and Blocked* outcomes.
    // NOTE(review): assumes the fetch result carries a null status code for
    // outcomes that never received an HTTP response — confirm against FetchResult.
    $this->pageCrawl->update([
        'outcome' => $result->outcome,
        'completed_at' => now(),
        'status_code' => $result->statusCode,
        'error_message' => $result->errorMessage,
    ]);
}
}

View file

@ -5,6 +5,7 @@
namespace Tests\Unit\Enums;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use Tests\TestCase;
class CrawlOutcomeEnumTest extends TestCase
@ -33,4 +34,42 @@ public function test_enum_has_exactly_seven_cases(): void
{
$this->assertCount(7, CrawlOutcomeEnum::cases());
}
public function test_to_page_status_maps_each_outcome_correctly(): void
{
    // Each pair is [outcome, page status it must map to].
    $expectations = [
        [CrawlOutcomeEnum::Success, PageStatusEnum::Fetched],
        [CrawlOutcomeEnum::Rejected, PageStatusEnum::Rejected],
        [CrawlOutcomeEnum::Failed, PageStatusEnum::Failed],
        [CrawlOutcomeEnum::Timeout, PageStatusEnum::Failed],
        [CrawlOutcomeEnum::Blocked4xx, PageStatusEnum::Failed],
        [CrawlOutcomeEnum::Blocked5xx, PageStatusEnum::Failed],
        [CrawlOutcomeEnum::BlockedRobots, PageStatusEnum::Failed],
    ];

    foreach ($expectations as [$outcome, $expectedStatus]) {
        $this->assertSame(
            $expectedStatus,
            $outcome->toPageStatus(),
            "Unexpected page status for {$outcome->name}",
        );
    }
}
public function test_is_retryable_returns_true_only_for_transient_failures(): void
{
    // Transient network/server problems that may resolve later.
    $retryable = [
        CrawlOutcomeEnum::Failed,
        CrawlOutcomeEnum::Timeout,
        CrawlOutcomeEnum::Blocked5xx,
    ];

    // Success (done), permanent failures, or policy decisions.
    $notRetryable = [
        CrawlOutcomeEnum::Success,
        CrawlOutcomeEnum::Rejected,
        CrawlOutcomeEnum::BlockedRobots,
        CrawlOutcomeEnum::Blocked4xx,
    ];

    foreach ($retryable as $outcome) {
        $this->assertTrue($outcome->isRetryable(), "{$outcome->name} should be retryable");
    }

    foreach ($notRetryable as $outcome) {
        $this->assertFalse($outcome->isRetryable(), "{$outcome->name} should not be retryable");
    }
}
public function test_should_register_outbound_links_returns_true_only_for_success(): void
{
    $this->assertTrue(CrawlOutcomeEnum::Success->shouldRegisterOutboundLinks());

    // Every outcome other than Success must decline link registration.
    $withoutLinks = [
        CrawlOutcomeEnum::Failed,
        CrawlOutcomeEnum::Timeout,
        CrawlOutcomeEnum::Rejected,
        CrawlOutcomeEnum::BlockedRobots,
        CrawlOutcomeEnum::Blocked4xx,
        CrawlOutcomeEnum::Blocked5xx,
    ];

    foreach ($withoutLinks as $outcome) {
        $this->assertFalse(
            $outcome->shouldRegisterOutboundLinks(),
            "{$outcome->name} should not register outbound links",
        );
    }
}
}