2026-04-26 13:06:22 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
|
|
declare(strict_types=1);
|
|
|
|
|
|
|
|
|
|
namespace App\Enums;
|
|
|
|
|
|
|
|
|
|
/**
 * Outcome of a single crawl attempt, backed by a string value.
 *
 * Besides the raw outcome, the enum answers three questions for the worker:
 * which page status the outcome maps to, whether the attempt should be
 * retried, and whether outbound links found during the fetch should be
 * registered.
 */
enum CrawlOutcomeEnum: string
{
    case Success = 'success';
    case Failed = 'failed';
    case Timeout = 'timeout';
    case BlockedRobots = 'blocked_robots';
    case Blocked4xx = 'blocked_4xx';
    case Blocked5xx = 'blocked_5xx';

    /**
     * The HTTP fetch returned 2xx, but the response cannot be indexed in v0.1
     * (non-HTML Content-Type).
     *
     * The worker MUST also write `pages.status = Rejected` for this outcome and
     * must NOT treat it as Failed. The page row STAYS in the DB so the URL is
     * not re-discovered in a loop as the fediverse re-shares it.
     */
    case Rejected = 'rejected';

    /**
     * Maps this outcome to the PageStatusEnum value the parent `pages` row
     * should end up with.
     *
     * The match is intentionally exhaustive (no default arm), so adding a new
     * case without a mapping fails loudly instead of silently picking a status.
     */
    public function toPageStatus(): PageStatusEnum
    {
        return match ($this) {
            // Every non-success, non-rejected outcome collapses to Failed.
            self::Failed,
            self::Timeout,
            self::BlockedRobots,
            self::Blocked4xx,
            self::Blocked5xx => PageStatusEnum::Failed,

            self::Rejected => PageStatusEnum::Rejected,
            self::Success => PageStatusEnum::Fetched,
        };
    }

    /**
     * Whether the worker should retry after this outcome.
     *
     * Only transient failures are retryable. Successes and permanent failures
     * (4xx, robots block, rejected content type) are not.
     */
    public function isRetryable(): bool
    {
        return match ($this) {
            // Final outcomes: retrying would not change the result.
            self::Success,
            self::Rejected,
            self::BlockedRobots,
            self::Blocked4xx => false,

            // Transient outcomes: worth another attempt.
            self::Failed,
            self::Timeout,
            self::Blocked5xx => true,
        };
    }

    /**
     * Whether the worker should register the outbound links discovered during
     * the fetch.
     *
     * Only Success carries meaningful links; every other outcome either failed
     * or returned no usable HTML.
     */
    public function shouldRegisterOutboundLinks(): bool
    {
        return match ($this) {
            self::Success => true,
            default => false,
        };
    }
}
|