chore - Move outcome → status mapping into CrawlOutcomeEnum methods
Some checks failed
CI / ci (push) Failing after 3h0m1s
Some checks failed
CI / ci (push) Failing after 3h0m1s
This commit is contained in:
parent
1538ceeb6e
commit
264180cd36
3 changed files with 119 additions and 62 deletions
|
|
@ -20,4 +20,41 @@ enum CrawlOutcomeEnum: string
|
||||||
* prevent re-discovery loops as fediverse re-shares the URL.
|
* prevent re-discovery loops as fediverse re-shares the URL.
|
||||||
*/
|
*/
|
||||||
case Rejected = 'rejected';
|
case Rejected = 'rejected';
|
||||||
|
|
||||||
|
/**
 * Translate this crawl outcome into the PageStatusEnum value that the parent
 * `pages` row should be set to.
 *
 * Success becomes Fetched, Rejected stays Rejected, and the failure-type
 * outcomes (Failed, Timeout, BlockedRobots, Blocked4xx, Blocked5xx) all
 * collapse to Failed. The match has no default arm, so an outcome that is
 * not listed here raises UnhandledMatchError instead of being mapped silently.
 */
public function toPageStatus(): PageStatusEnum
{
    $pageStatus = match ($this) {
        self::Failed,
        self::Timeout,
        self::BlockedRobots,
        self::Blocked4xx,
        self::Blocked5xx => PageStatusEnum::Failed,
        self::Rejected => PageStatusEnum::Rejected,
        self::Success => PageStatusEnum::Fetched,
    };

    return $pageStatus;
}
|
||||||
|
|
||||||
|
/**
 * Whether the worker should attempt this crawl again.
 *
 * Only the transient outcomes (Failed, Timeout, Blocked5xx) answer true.
 * Success, Rejected, BlockedRobots and Blocked4xx answer false. The match is
 * deliberately exhaustive (no default arm), so an unlisted outcome raises
 * UnhandledMatchError rather than silently counting as non-retryable.
 */
public function isRetryable(): bool
{
    $shouldRetry = match ($this) {
        self::Success,
        self::Rejected,
        self::BlockedRobots,
        self::Blocked4xx => false,
        self::Failed,
        self::Timeout,
        self::Blocked5xx => true,
    };

    return $shouldRetry;
}
|
||||||
|
|
||||||
|
/**
 * Whether the outbound links collected during the fetch should be registered.
 *
 * Answers true for Success and false for every other outcome.
 */
public function shouldRegisterOutboundLinks(): bool
{
    return match ($this) {
        self::Success => true,
        default => false,
    };
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,11 @@
|
||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
namespace App\Jobs;
|
namespace App\Jobs;
|
||||||
|
|
||||||
use App\Actions\FetchPageAction;
|
use App\Actions\FetchPageAction;
|
||||||
use App\Actions\RegisterDiscoveredPageAction;
|
use App\Actions\RegisterDiscoveredPageAction;
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
|
||||||
use App\Enums\PageStatusEnum;
|
use App\Enums\PageStatusEnum;
|
||||||
use App\Models\PageCrawl;
|
use App\Models\PageCrawl;
|
||||||
use App\Services\PolitenessService;
|
use App\Services\PolitenessService;
|
||||||
|
|
@ -36,64 +37,64 @@ public function handle(): void
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @var FetchResult $result */
|
|
||||||
$result = $fetcher($this->pageCrawl->page->url);
|
$result = $fetcher($this->pageCrawl->page->url);
|
||||||
|
|
||||||
$this->updatePageCrawl($result);
|
$this->writeOutcome($result);
|
||||||
|
$this->updatePageStatus($result);
|
||||||
|
|
||||||
$update = match ($result->outcome) {
|
if ($result->outcome->shouldRegisterOutboundLinks()) {
|
||||||
CrawlOutcomeEnum::Rejected => [
|
$result->outboundLinks->each(fn (string $url) => $register($url));
|
||||||
'status' => PageStatusEnum::Rejected,
|
}
|
||||||
'fetched_at' => null,
|
|
||||||
],
|
if ($result->outcome->isRetryable()) {
|
||||||
CrawlOutcomeEnum::Timeout => [
|
$this->scheduleRetryIfNeeded();
|
||||||
'status' => PageStatusEnum::Failed,
|
}
|
||||||
'failed_at' => now(),
|
}
|
||||||
],
|
|
||||||
CrawlOutcomeEnum::Failed => [
|
private function writeOutcome(FetchResult $result): void
|
||||||
'status' => PageStatusEnum::Failed,
|
{
|
||||||
],
|
$this->pageCrawl->update([
|
||||||
CrawlOutcomeEnum::Blocked4xx => [
|
'outcome' => $result->outcome,
|
||||||
'status' => PageStatusEnum::Failed,
|
'completed_at' => now(),
|
||||||
'failed_at' => now(),
|
'status_code' => $result->statusCode,
|
||||||
],
|
'error_message' => $result->errorMessage,
|
||||||
CrawlOutcomeEnum::Blocked5xx => [
|
]);
|
||||||
'status' => PageStatusEnum::Failed,
|
}
|
||||||
],
|
|
||||||
CrawlOutcomeEnum::BlockedRobots => [
|
private function updatePageStatus(FetchResult $result): void
|
||||||
'status' => PageStatusEnum::Failed,
|
{
|
||||||
],
|
$status = $result->outcome->toPageStatus();
|
||||||
default => [
|
|
||||||
'status' => PageStatusEnum::Fetched,
|
$update = match ($status) {
|
||||||
|
PageStatusEnum::Fetched => [
|
||||||
|
'status' => $status,
|
||||||
'fetched_at' => now(),
|
'fetched_at' => now(),
|
||||||
'title' => $result->title,
|
'title' => $result->title,
|
||||||
],
|
],
|
||||||
|
PageStatusEnum::Failed => [
|
||||||
|
'status' => $status,
|
||||||
|
'failed_at' => now(),
|
||||||
|
],
|
||||||
|
PageStatusEnum::Rejected => [
|
||||||
|
'status' => $status,
|
||||||
|
],
|
||||||
|
PageStatusEnum::Discovered => [
|
||||||
|
'status' => $status,
|
||||||
|
],
|
||||||
};
|
};
|
||||||
|
|
||||||
$this->pageCrawl->page->update($update);
|
$this->pageCrawl->page->update($update);
|
||||||
|
|
||||||
if ($result->outcome !== CrawlOutcomeEnum::Failed) {
|
|
||||||
$result->outboundLinks->each(fn (string $url) => $register($url));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (in_array($result->outcome, [
|
|
||||||
CrawlOutcomeEnum::Failed,
|
|
||||||
CrawlOutcomeEnum::Timeout,
|
|
||||||
CrawlOutcomeEnum::Blocked5xx,
|
|
||||||
])) {
|
|
||||||
$this->scheduleRetryIfNeeded($result, $this->pageCrawl);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): void
|
private function scheduleRetryIfNeeded(): void
|
||||||
{
|
{
|
||||||
if (PageCrawl::where('page_id', $crawl->page_id)->count() >= 3) {
|
if (PageCrawl::where('page_id', $this->pageCrawl->page_id)->count() >= 3) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
$newRow = PageCrawl::withoutEvents(
|
$newRow = PageCrawl::withoutEvents(
|
||||||
fn () => PageCrawl::create(
|
fn () => PageCrawl::create(
|
||||||
array_merge($crawl->toArray(), [
|
array_merge($this->pageCrawl->toArray(), [
|
||||||
'outcome' => null,
|
'outcome' => null,
|
||||||
])
|
])
|
||||||
)
|
)
|
||||||
|
|
@ -101,24 +102,4 @@ private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): v
|
||||||
|
|
||||||
ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
|
ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Persist the result of the fetch on this job's PageCrawl row.
 *
 * The outcome, HTTP status code and error message are written exactly as the
 * FetchResult reports them, and completed_at is stamped with the current time.
 *
 * Earlier behaviour recorded every outcome other than Failed (Timeout,
 * Rejected, Blocked4xx, Blocked5xx, BlockedRobots) as Success with a
 * hard-coded status code of 200 and no error message, which misreported
 * those crawls.
 *
 * @param FetchResult $result Result object returned by the fetch action.
 */
public function updatePageCrawl(FetchResult $result): void
{
    $this->pageCrawl->update([
        'outcome' => $result->outcome,
        'completed_at' => now(),
        // NOTE(review): assumes FetchResult leaves statusCode null when no HTTP response was received — confirm.
        'status_code' => $result->statusCode,
        'error_message' => $result->errorMessage,
    ]);
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@
|
||||||
namespace Tests\Unit\Enums;
|
namespace Tests\Unit\Enums;
|
||||||
|
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
use App\Enums\CrawlOutcomeEnum;
|
||||||
|
use App\Enums\PageStatusEnum;
|
||||||
use Tests\TestCase;
|
use Tests\TestCase;
|
||||||
|
|
||||||
class CrawlOutcomeEnumTest extends TestCase
|
class CrawlOutcomeEnumTest extends TestCase
|
||||||
|
|
@ -33,4 +34,42 @@ public function test_enum_has_exactly_seven_cases(): void
|
||||||
{
|
{
|
||||||
$this->assertCount(7, CrawlOutcomeEnum::cases());
|
$this->assertCount(7, CrawlOutcomeEnum::cases());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Each crawl outcome must map to its expected page status.
 */
public function test_to_page_status_maps_each_outcome_correctly(): void
{
    // [outcome, expected page status] pairs; enum cases cannot be array keys.
    $expectations = [
        [CrawlOutcomeEnum::Success, PageStatusEnum::Fetched],
        [CrawlOutcomeEnum::Rejected, PageStatusEnum::Rejected],
        [CrawlOutcomeEnum::Failed, PageStatusEnum::Failed],
        [CrawlOutcomeEnum::Timeout, PageStatusEnum::Failed],
        [CrawlOutcomeEnum::Blocked4xx, PageStatusEnum::Failed],
        [CrawlOutcomeEnum::Blocked5xx, PageStatusEnum::Failed],
        [CrawlOutcomeEnum::BlockedRobots, PageStatusEnum::Failed],
    ];

    foreach ($expectations as [$outcome, $expectedStatus]) {
        $this->assertSame($expectedStatus, $outcome->toPageStatus());
    }
}
|
||||||
|
|
||||||
|
/**
 * Only transient failures may be retried; everything else must not be.
 */
public function test_is_retryable_returns_true_only_for_transient_failures(): void
{
    // Transient network/server problems that may resolve later.
    $retryable = [
        CrawlOutcomeEnum::Failed,
        CrawlOutcomeEnum::Timeout,
        CrawlOutcomeEnum::Blocked5xx,
    ];

    // Success (already done), permanent failures, or policy decisions.
    $notRetryable = [
        CrawlOutcomeEnum::Success,
        CrawlOutcomeEnum::Rejected,
        CrawlOutcomeEnum::BlockedRobots,
        CrawlOutcomeEnum::Blocked4xx,
    ];

    foreach ($retryable as $outcome) {
        $this->assertTrue($outcome->isRetryable());
    }

    foreach ($notRetryable as $outcome) {
        $this->assertFalse($outcome->isRetryable());
    }
}
|
||||||
|
|
||||||
|
/**
 * Outbound links are registered for Success and for no other outcome.
 */
public function test_should_register_outbound_links_returns_true_only_for_success(): void
{
    $this->assertTrue(CrawlOutcomeEnum::Success->shouldRegisterOutboundLinks());

    // Every non-Success outcome must decline link registration.
    $nonSuccessOutcomes = [
        CrawlOutcomeEnum::Failed,
        CrawlOutcomeEnum::Timeout,
        CrawlOutcomeEnum::Rejected,
        CrawlOutcomeEnum::BlockedRobots,
        CrawlOutcomeEnum::Blocked4xx,
        CrawlOutcomeEnum::Blocked5xx,
    ];

    foreach ($nonSuccessOutcomes as $outcome) {
        $this->assertFalse($outcome->shouldRegisterOutboundLinks());
    }
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue