From 264180cd369d12a56792168266de81787d224f92 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Mon, 27 Apr 2026 01:36:37 +0200 Subject: [PATCH] =?UTF-8?q?chore=20-=20Move=20outcome=20=E2=86=92=20status?= =?UTF-8?q?=20mapping=20into=20CrawlOutcomeEnum=20methods?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/Enums/CrawlOutcomeEnum.php | 37 ++++++++ app/Jobs/ProcessCrawlJob.php | 105 +++++++++------------- tests/Unit/Enums/CrawlOutcomeEnumTest.php | 39 ++++++++ 3 files changed, 119 insertions(+), 62 deletions(-) diff --git a/app/Enums/CrawlOutcomeEnum.php b/app/Enums/CrawlOutcomeEnum.php index 949cf69..582fdc9 100644 --- a/app/Enums/CrawlOutcomeEnum.php +++ b/app/Enums/CrawlOutcomeEnum.php @@ -20,4 +20,41 @@ enum CrawlOutcomeEnum: string * prevent re-discovery loops as fediverse re-shares the URL. */ case Rejected = 'rejected'; + + /** + * The PageStatusEnum value the parent `pages` row should land on for this outcome. + */ + public function toPageStatus(): PageStatusEnum + { + return match ($this) { + self::Success => PageStatusEnum::Fetched, + self::Rejected => PageStatusEnum::Rejected, + self::Failed, + self::Timeout, + self::BlockedRobots, + self::Blocked4xx, + self::Blocked5xx => PageStatusEnum::Failed, + }; + } + + /** + * True if the worker should retry this outcome (transient failures only). + * Permanent failures (4xx, robots block, rejected content type) and successes do not retry. + */ + public function isRetryable(): bool + { + return match ($this) { + self::Failed, self::Timeout, self::Blocked5xx => true, + self::Success, self::Rejected, self::BlockedRobots, self::Blocked4xx => false, + }; + } + + /** + * True if the worker should register the outbound links discovered during the fetch. + * Only Success outcomes have meaningful links; everything else either failed or returned no usable HTML. + */ + public function shouldRegisterOutboundLinks(): bool + { + return $this === self::Success; + } } diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php index d2928d0..7b30a3f 100644 --- a/app/Jobs/ProcessCrawlJob.php +++ b/app/Jobs/ProcessCrawlJob.php @@ -1,10 +1,11 @@ pageCrawl->page->url); - $this->updatePageCrawl($result); + $this->writeOutcome($result); + $this->updatePageStatus($result); - $update = match ($result->outcome) { - CrawlOutcomeEnum::Rejected => [ - 'status' => PageStatusEnum::Rejected, - 'fetched_at' => null, - ], - CrawlOutcomeEnum::Timeout => [ - 'status' => PageStatusEnum::Failed, - 'failed_at' => now(), - ], - CrawlOutcomeEnum::Failed => [ - 'status' => PageStatusEnum::Failed, - ], - CrawlOutcomeEnum::Blocked4xx => [ - 'status' => PageStatusEnum::Failed, - 'failed_at' => now(), - ], - CrawlOutcomeEnum::Blocked5xx => [ - 'status' => PageStatusEnum::Failed, - ], - CrawlOutcomeEnum::BlockedRobots => [ - 'status' => PageStatusEnum::Failed, - ], - default => [ - 'status' => PageStatusEnum::Fetched, + if ($result->outcome->shouldRegisterOutboundLinks()) { + $result->outboundLinks->each(fn (string $url) => $register($url)); + } + + if ($result->outcome->isRetryable()) { + $this->scheduleRetryIfNeeded(); + } + } + + private function writeOutcome(FetchResult $result): void + { + $this->pageCrawl->update([ + 'outcome' => $result->outcome, + 'completed_at' => now(), + 'status_code' => $result->statusCode, + 'error_message' => $result->errorMessage, + ]); + } + + private function updatePageStatus(FetchResult $result): void + { + $status = $result->outcome->toPageStatus(); + + $update = match ($status) { + PageStatusEnum::Fetched => [ + 'status' => $status, 'fetched_at' => now(), 'title' => $result->title, ], + PageStatusEnum::Failed => [ + 'status' => $status, + 'failed_at' => now(), + ], + PageStatusEnum::Rejected => [ + 'status' => $status, + ], + PageStatusEnum::Discovered => [ + 'status' => $status, + ], }; $this->pageCrawl->page->update($update); - - if ($result->outcome !== CrawlOutcomeEnum::Failed) { - $result->outboundLinks->each(fn (string $url) => $register($url)); - } - - if (in_array($result->outcome, [ - CrawlOutcomeEnum::Failed, - CrawlOutcomeEnum::Timeout, - CrawlOutcomeEnum::Blocked5xx, - ])) { - $this->scheduleRetryIfNeeded($result, $this->pageCrawl); - } } - private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): void + private function scheduleRetryIfNeeded(): void { - if (PageCrawl::where('page_id', $crawl->page_id)->count() >= 3) { + if (PageCrawl::where('page_id', $this->pageCrawl->page_id)->count() >= 3) { return; } $newRow = PageCrawl::withoutEvents( fn () => PageCrawl::create( - array_merge($crawl->toArray(), [ + array_merge($this->pageCrawl->toArray(), [ 'outcome' => null, ]) ) @@ -101,24 +102,4 @@ private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): v ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour()); } - - public function updatePageCrawl(FetchResult $result): void - { - $outcome = CrawlOutcomeEnum::Success; - $errorMessage = null; - $statusCode = 200; - - if ($result->outcome === CrawlOutcomeEnum::Failed) { - $outcome = CrawlOutcomeEnum::Failed; - $errorMessage = $result->errorMessage; - $statusCode = null; - } - - $this->pageCrawl->update([ - 'outcome' => $outcome, - 'completed_at' => now(), - 'status_code' => $statusCode, - 'error_message' => $errorMessage, - ]); - } } diff --git a/tests/Unit/Enums/CrawlOutcomeEnumTest.php b/tests/Unit/Enums/CrawlOutcomeEnumTest.php index 56261cb..17b214d 100644 --- a/tests/Unit/Enums/CrawlOutcomeEnumTest.php +++ b/tests/Unit/Enums/CrawlOutcomeEnumTest.php @@ -5,6 +5,7 @@ namespace Tests\Unit\Enums; use App\Enums\CrawlOutcomeEnum; +use App\Enums\PageStatusEnum; use Tests\TestCase; class CrawlOutcomeEnumTest extends TestCase @@ -33,4 +34,42 @@ public function test_enum_has_exactly_seven_cases(): void { $this->assertCount(7, CrawlOutcomeEnum::cases()); } + + public function test_to_page_status_maps_each_outcome_correctly(): void + { + $this->assertSame(PageStatusEnum::Fetched, CrawlOutcomeEnum::Success->toPageStatus()); + $this->assertSame(PageStatusEnum::Rejected, CrawlOutcomeEnum::Rejected->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Failed->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Timeout->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Blocked4xx->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Blocked5xx->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::BlockedRobots->toPageStatus()); + } + + public function test_is_retryable_returns_true_only_for_transient_failures(): void + { + // Retryable: transient network/server problems that may resolve later + $this->assertTrue(CrawlOutcomeEnum::Failed->isRetryable()); + $this->assertTrue(CrawlOutcomeEnum::Timeout->isRetryable()); + $this->assertTrue(CrawlOutcomeEnum::Blocked5xx->isRetryable()); + + // Not retryable: success (done), permanent failures, or policy decisions + $this->assertFalse(CrawlOutcomeEnum::Success->isRetryable()); + $this->assertFalse(CrawlOutcomeEnum::Rejected->isRetryable()); + $this->assertFalse(CrawlOutcomeEnum::BlockedRobots->isRetryable()); + $this->assertFalse(CrawlOutcomeEnum::Blocked4xx->isRetryable()); + } + + public function test_should_register_outbound_links_returns_true_only_for_success(): void + { + $this->assertTrue(CrawlOutcomeEnum::Success->shouldRegisterOutboundLinks()); + + // No links to register on any non-Success outcome + $this->assertFalse(CrawlOutcomeEnum::Failed->shouldRegisterOutboundLinks()); + $this->assertFalse(CrawlOutcomeEnum::Timeout->shouldRegisterOutboundLinks()); + $this->assertFalse(CrawlOutcomeEnum::Rejected->shouldRegisterOutboundLinks()); + $this->assertFalse(CrawlOutcomeEnum::BlockedRobots->shouldRegisterOutboundLinks()); + $this->assertFalse(CrawlOutcomeEnum::Blocked4xx->shouldRegisterOutboundLinks()); + $this->assertFalse(CrawlOutcomeEnum::Blocked5xx->shouldRegisterOutboundLinks()); + } }