trove/app/Enums/CrawlOutcomeEnum.php

61 lines
1.9 KiB
PHP
Raw Normal View History

<?php
declare(strict_types=1);
namespace App\Enums;
/**
 * String-backed set of outcomes a crawl fetch can end in.
 *
 * Besides the raw values, the enum answers three questions about each outcome:
 * which page status it maps to, whether it is worth retrying, and whether
 * outbound links found during the fetch should be registered.
 */
enum CrawlOutcomeEnum: string
{
    case Success = 'success';
    case Failed = 'failed';
    case Timeout = 'timeout';
    case BlockedRobots = 'blocked_robots';
    case Blocked4xx = 'blocked_4xx';
    case Blocked5xx = 'blocked_5xx';

    /**
     * The HTTP fetch succeeded (2xx), but the response is unindexable in v0.1
     * (non-HTML Content-Type).
     *
     * The worker MUST also write `pages.status = Rejected` on this outcome;
     * do NOT treat it as Failed. The page row STAYS in the DB to prevent
     * re-discovery loops as the fediverse re-shares the URL.
     */
    case Rejected = 'rejected';

    /**
     * Maps this outcome to the PageStatusEnum value the parent `pages` row
     * should end up with.
     *
     * Success maps to Fetched, Rejected maps to Rejected, and every other
     * outcome collapses to Failed.
     */
    public function toPageStatus(): PageStatusEnum
    {
        // Kept as an exhaustive match (no default arm) so a newly added case
        // fails loudly here instead of silently inheriting a status.
        return match ($this) {
            self::Failed, self::Timeout,
            self::BlockedRobots, self::Blocked4xx, self::Blocked5xx => PageStatusEnum::Failed,
            self::Rejected => PageStatusEnum::Rejected,
            self::Success => PageStatusEnum::Fetched,
        };
    }

    /**
     * Whether the worker should retry after this outcome.
     *
     * Only transient failures (generic failure, timeout, 5xx) are retryable.
     * Permanent failures (4xx, robots block, rejected content type) and
     * successes are not.
     */
    public function isRetryable(): bool
    {
        // Exhaustive on purpose: every case must be classified explicitly.
        return match ($this) {
            self::Success,
            self::Rejected,
            self::BlockedRobots,
            self::Blocked4xx => false,
            self::Failed,
            self::Timeout,
            self::Blocked5xx => true,
        };
    }

    /**
     * Whether the worker should register the outbound links discovered
     * during the fetch.
     *
     * Only a Success outcome carries meaningful links; every other outcome
     * either failed or returned no usable HTML.
     */
    public function shouldRegisterOutboundLinks(): bool
    {
        return match ($this) {
            self::Success => true,
            default => false,
        };
    }
}