trove/app/Jobs/ProcessCrawlJob.php

85 lines
2.4 KiB
PHP
Raw Normal View History

<?php
namespace App\Jobs;
use App\Actions\FetchPageAction;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use App\Models\PageCrawl;
use App\ValueObjects\FetchResult;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Queue\Queueable;
/**
 * Queued job that fetches the page behind a PageCrawl row, records the result
 * on both the crawl row and its page, registers every outbound link that was
 * discovered, and queues a delayed retry for retryable outcomes.
 */
class ProcessCrawlJob implements ShouldQueue
{
    use Queueable;

    /**
     * Once a page has this many PageCrawl rows, no further retry is queued.
     */
    private const MAX_CRAWLS_PER_PAGE = 3;

    /**
     * Outcomes for which a retry crawl is scheduled.
     */
    private const RETRYABLE_OUTCOMES = [
        CrawlOutcomeEnum::Failed,
        CrawlOutcomeEnum::Timeout,
        CrawlOutcomeEnum::Blocked5xx,
    ];

    /**
     * @param PageCrawl $pageCrawl The crawl attempt this job processes.
     */
    public function __construct(
        public PageCrawl $pageCrawl,
    ) {}

    /**
     * Fetch the page, persist the outcome and follow up on it.
     *
     * @param FetchPageAction              $fetcher  Invoked with the page URL; yields a FetchResult.
     * @param RegisterDiscoveredPageAction $register Invoked once per outbound link URL.
     */
    public function handle(
        FetchPageAction $fetcher,
        RegisterDiscoveredPageAction $register,
    ): void {
        $page = $this->pageCrawl->page;

        /** @var FetchResult $result */
        $result = $fetcher($page->url);

        $this->pageCrawl->update([
            // Store what actually happened instead of unconditionally claiming success.
            'outcome' => $result->outcome,
            'completed_at' => now(),
            // NOTE(review): still hard-coded. If FetchResult exposes the HTTP
            // status, it should be written here instead — confirm and replace.
            'status_code' => 200,
        ]);

        $pageAttributes = $this->pageAttributesFor($result);

        // A void callback is used on purpose: a Collection's each() stops as
        // soon as the callback returns false, so the action's return value
        // must not leak out. (Presumably outboundLinks is a Collection — it is
        // only used via each() here.)
        $result->outboundLinks->each(function (string $url) use ($register): void {
            $register($url);
        });

        $page->update($pageAttributes);

        if (in_array($result->outcome, self::RETRYABLE_OUTCOMES, true)) {
            $this->scheduleRetryIfNeeded($this->pageCrawl);
        }
    }

    /**
     * Map a fetch result onto the attributes to store on the crawled page.
     *
     * Every failure outcome (including the retryable Failed and Blocked5xx)
     * marks the page as failed; only outcomes not listed here fall through to
     * the "fetched" branch.
     *
     * @return array<string, mixed>
     */
    private function pageAttributesFor(FetchResult $result): array
    {
        return match ($result->outcome) {
            CrawlOutcomeEnum::Rejected => [
                'status' => PageStatusEnum::Rejected,
                'fetched_at' => null,
            ],
            CrawlOutcomeEnum::Failed,
            CrawlOutcomeEnum::Timeout,
            CrawlOutcomeEnum::Blocked4xx,
            CrawlOutcomeEnum::Blocked5xx => [
                'status' => PageStatusEnum::Failed,
                'failed_at' => now(),
            ],
            default => [
                'status' => PageStatusEnum::Fetched,
                'fetched_at' => now(),
                'title' => $result->title,
            ],
        };
    }

    /**
     * Create a fresh PageCrawl row for the same page and dispatch this job for
     * it with a one-hour delay, unless the page already has
     * MAX_CRAWLS_PER_PAGE crawl rows.
     *
     * @param PageCrawl $crawl The crawl attempt that just finished.
     */
    private function scheduleRetryIfNeeded(PageCrawl $crawl): void
    {
        $existingCrawls = PageCrawl::where('page_id', $crawl->page_id)->count();

        if ($existingCrawls >= self::MAX_CRAWLS_PER_PAGE) {
            return;
        }

        // Columns identifying the finished row must not be copied onto the new
        // one; array_filter drops timestamp column names that are disabled (null).
        $identityColumns = array_filter([
            $crawl->getKeyName(),
            $crawl->getCreatedAtColumn(),
            $crawl->getUpdatedAtColumn(),
        ]);

        // attributesToArray() leaves out loaded relations (the page relation is
        // loaded by handle()), which toArray() would include as a nested array.
        $attributes = array_merge(
            array_diff_key($crawl->attributesToArray(), array_flip($identityColumns)),
            [
                // The retry has not run yet: clear everything handle() wrote
                // onto the finished attempt.
                'outcome' => null,
                'completed_at' => null,
                'status_code' => null,
            ],
        );

        $retry = PageCrawl::withoutEvents(
            static fn () => PageCrawl::create($attributes),
        );

        self::dispatch($retry)->delay(now()->addHour());
    }
}