chore - Move outcome → status mapping into CrawlOutcomeEnum methods
Some checks failed
CI / ci (push) Failing after 3h0m1s
Some checks failed
CI / ci (push) Failing after 3h0m1s
This commit is contained in:
parent
1538ceeb6e
commit
264180cd36
3 changed files with 119 additions and 62 deletions
|
|
@ -20,4 +20,41 @@ enum CrawlOutcomeEnum: string
|
|||
* prevent re-discovery loops as fediverse re-shares the URL.
|
||||
*/
|
||||
case Rejected = 'rejected';
|
||||
|
||||
/**
|
||||
* The PageStatusEnum value the parent `pages` row should land on for this outcome.
|
||||
*/
|
||||
public function toPageStatus(): PageStatusEnum
{
    // Every case is listed explicitly (no `default` arm) so that adding a new
    // outcome without deciding its page status fails with an
    // UnhandledMatchError instead of silently picking one.
    $pageStatus = match ($this) {
        self::Failed, self::Timeout, self::BlockedRobots,
        self::Blocked4xx, self::Blocked5xx => PageStatusEnum::Failed,
        self::Rejected => PageStatusEnum::Rejected,
        self::Success => PageStatusEnum::Fetched,
    };

    return $pageStatus;
}
|
||||
|
||||
/**
|
||||
* True if the worker should retry this outcome (transient failures only).
|
||||
* Permanent failures (4xx, robots block, rejected content type) and successes do not retry.
|
||||
*/
|
||||
public function isRetryable(): bool
{
    // Exhaustive on purpose: a newly added case must be classified here
    // before it can be matched at all.
    $retryable = match ($this) {
        // Finished, or failed in a way the match treats as final.
        self::Success,
        self::Rejected,
        self::BlockedRobots,
        self::Blocked4xx => false,
        // Failures the worker is allowed to try again.
        self::Failed,
        self::Timeout,
        self::Blocked5xx => true,
    };

    return $retryable;
}
|
||||
|
||||
/**
|
||||
* True if the worker should register the outbound links discovered during the fetch.
|
||||
* Only Success outcomes have meaningful links; everything else either failed or returned no usable HTML.
|
||||
*/
|
||||
public function shouldRegisterOutboundLinks(): bool
{
    // Success is the single outcome that answers true; the `default` arm
    // keeps every other case (present or future) at false.
    return match ($this) {
        self::Success => true,
        default => false,
    };
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,10 +1,11 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Actions\FetchPageAction;
|
||||
use App\Actions\RegisterDiscoveredPageAction;
|
||||
use App\Enums\CrawlOutcomeEnum;
|
||||
use App\Enums\PageStatusEnum;
|
||||
use App\Models\PageCrawl;
|
||||
use App\Services\PolitenessService;
|
||||
|
|
@ -36,64 +37,64 @@ public function handle(): void
|
|||
return;
|
||||
}
|
||||
|
||||
/** @var FetchResult $result */
|
||||
$result = $fetcher($this->pageCrawl->page->url);
|
||||
|
||||
$this->updatePageCrawl($result);
|
||||
$this->writeOutcome($result);
|
||||
$this->updatePageStatus($result);
|
||||
|
||||
$update = match ($result->outcome) {
|
||||
CrawlOutcomeEnum::Rejected => [
|
||||
'status' => PageStatusEnum::Rejected,
|
||||
'fetched_at' => null,
|
||||
],
|
||||
CrawlOutcomeEnum::Timeout => [
|
||||
'status' => PageStatusEnum::Failed,
|
||||
'failed_at' => now(),
|
||||
],
|
||||
CrawlOutcomeEnum::Failed => [
|
||||
'status' => PageStatusEnum::Failed,
|
||||
],
|
||||
CrawlOutcomeEnum::Blocked4xx => [
|
||||
'status' => PageStatusEnum::Failed,
|
||||
'failed_at' => now(),
|
||||
],
|
||||
CrawlOutcomeEnum::Blocked5xx => [
|
||||
'status' => PageStatusEnum::Failed,
|
||||
],
|
||||
CrawlOutcomeEnum::BlockedRobots => [
|
||||
'status' => PageStatusEnum::Failed,
|
||||
],
|
||||
default => [
|
||||
'status' => PageStatusEnum::Fetched,
|
||||
if ($result->outcome->shouldRegisterOutboundLinks()) {
|
||||
$result->outboundLinks->each(fn (string $url) => $register($url));
|
||||
}
|
||||
|
||||
if ($result->outcome->isRetryable()) {
|
||||
$this->scheduleRetryIfNeeded();
|
||||
}
|
||||
}
|
||||
|
||||
private function writeOutcome(FetchResult $result): void
{
    // Copy the fetch result onto the crawl row as reported, and stamp the
    // moment the crawl finished.
    $attributes = [
        'outcome' => $result->outcome,
        'completed_at' => now(),
        'status_code' => $result->statusCode,
        'error_message' => $result->errorMessage,
    ];

    $crawl = $this->pageCrawl;
    $crawl->update($attributes);
}
|
||||
|
||||
/**
 * Move the parent page to the status implied by the crawl outcome.
 *
 * Only the page row is touched here. Registering outbound links and
 * scheduling retries are decided by the caller from the outcome itself,
 * so they are deliberately not repeated in this method.
 *
 * @param FetchResult $result Result of the fetch; its outcome selects the
 *                            status and its title is stored on success.
 */
private function updatePageStatus(FetchResult $result): void
{
    $status = $result->outcome->toPageStatus();

    // Columns written in addition to `status`, per target status.
    $extra = match ($status) {
        PageStatusEnum::Fetched => [
            'fetched_at' => now(),
            'title' => $result->title,
        ],
        // Every outcome that maps to Failed stamps failed_at, including
        // robots blocks and 5xx responses.
        // NOTE(review): confirm that is intended for retryable outcomes.
        PageStatusEnum::Failed => [
            'failed_at' => now(),
        ],
        // toPageStatus() never yields Discovered; the arm is kept so the
        // match stays exhaustive over the statuses listed here.
        PageStatusEnum::Rejected,
        PageStatusEnum::Discovered => [],
    };

    $this->pageCrawl->page->update(['status' => $status] + $extra);
}
|
||||
|
||||
private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): void
|
||||
private function scheduleRetryIfNeeded(): void
|
||||
{
|
||||
if (PageCrawl::where('page_id', $crawl->page_id)->count() >= 3) {
|
||||
if (PageCrawl::where('page_id', $this->pageCrawl->page_id)->count() >= 3) {
|
||||
return;
|
||||
}
|
||||
|
||||
$newRow = PageCrawl::withoutEvents(
|
||||
fn () => PageCrawl::create(
|
||||
array_merge($crawl->toArray(), [
|
||||
array_merge($this->pageCrawl->toArray(), [
|
||||
'outcome' => null,
|
||||
])
|
||||
)
|
||||
|
|
@ -101,24 +102,4 @@ private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): v
|
|||
|
||||
ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
|
||||
}
|
||||
|
||||
/**
 * Record the result of a fetch on the crawl row.
 *
 * The outcome, status code and error message are stored exactly as the
 * fetch reported them. Previously every outcome other than Failed was
 * written as Success with a hard-coded 200, which mislabelled timeouts,
 * rejections and blocked responses.
 *
 * @param FetchResult $result Result of the fetch to persist.
 */
public function updatePageCrawl(FetchResult $result): void
{
    $this->pageCrawl->update([
        'outcome' => $result->outcome,
        'completed_at' => now(),
        'status_code' => $result->statusCode,
        'error_message' => $result->errorMessage,
    ]);
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
namespace Tests\Unit\Enums;
|
||||
|
||||
use App\Enums\CrawlOutcomeEnum;
|
||||
use App\Enums\PageStatusEnum;
|
||||
use Tests\TestCase;
|
||||
|
||||
class CrawlOutcomeEnumTest extends TestCase
|
||||
|
|
@ -33,4 +34,42 @@ public function test_enum_has_exactly_seven_cases(): void
|
|||
{
|
||||
$this->assertCount(7, CrawlOutcomeEnum::cases());
|
||||
}
|
||||
|
||||
public function test_to_page_status_maps_each_outcome_correctly(): void
{
    // [outcome, page status it must map to]
    $expectations = [
        [CrawlOutcomeEnum::Success, PageStatusEnum::Fetched],
        [CrawlOutcomeEnum::Rejected, PageStatusEnum::Rejected],
        [CrawlOutcomeEnum::Failed, PageStatusEnum::Failed],
        [CrawlOutcomeEnum::Timeout, PageStatusEnum::Failed],
        [CrawlOutcomeEnum::Blocked4xx, PageStatusEnum::Failed],
        [CrawlOutcomeEnum::Blocked5xx, PageStatusEnum::Failed],
        [CrawlOutcomeEnum::BlockedRobots, PageStatusEnum::Failed],
    ];

    foreach ($expectations as [$outcome, $expectedStatus]) {
        self::assertSame(
            $expectedStatus,
            $outcome->toPageStatus(),
            "Unexpected page status for outcome {$outcome->name}",
        );
    }
}
|
||||
|
||||
public function test_is_retryable_returns_true_only_for_transient_failures(): void
{
    // [outcome, whether a retry is expected]
    $expectations = [
        // Transient network/server problems that may resolve later.
        [CrawlOutcomeEnum::Failed, true],
        [CrawlOutcomeEnum::Timeout, true],
        [CrawlOutcomeEnum::Blocked5xx, true],
        // Success (done), permanent failures, or policy decisions.
        [CrawlOutcomeEnum::Success, false],
        [CrawlOutcomeEnum::Rejected, false],
        [CrawlOutcomeEnum::BlockedRobots, false],
        [CrawlOutcomeEnum::Blocked4xx, false],
    ];

    foreach ($expectations as [$outcome, $expectedRetryable]) {
        self::assertSame(
            $expectedRetryable,
            $outcome->isRetryable(),
            "Unexpected retryability for outcome {$outcome->name}",
        );
    }
}
|
||||
|
||||
public function test_should_register_outbound_links_returns_true_only_for_success(): void
{
    self::assertTrue(CrawlOutcomeEnum::Success->shouldRegisterOutboundLinks());

    // Every other outcome must refuse link registration.
    $nonSuccessOutcomes = [
        CrawlOutcomeEnum::Failed,
        CrawlOutcomeEnum::Timeout,
        CrawlOutcomeEnum::Rejected,
        CrawlOutcomeEnum::BlockedRobots,
        CrawlOutcomeEnum::Blocked4xx,
        CrawlOutcomeEnum::Blocked5xx,
    ];

    foreach ($nonSuccessOutcomes as $outcome) {
        self::assertFalse(
            $outcome->shouldRegisterOutboundLinks(),
            "Outcome {$outcome->name} must not register outbound links",
        );
    }
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue