trove/app/Jobs/ProcessCrawlJob.php

125 lines
3.6 KiB
PHP
Raw Normal View History

<?php
namespace App\Jobs;
use App\Actions\FetchPageAction;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use App\Models\PageCrawl;
use App\Services\PolitenessService;
use App\ValueObjects\FetchResult;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Queue\Queueable;
use Illuminate\Support\Facades\Cache;
/**
 * Queued job that processes a single PageCrawl row.
 *
 * It fetches the URL of the crawl's page, records the result on the crawl row
 * and on the page, registers any outbound links that were found, and queues a
 * follow-up crawl when the outcome is one of the retryable ones.
 */
class ProcessCrawlJob implements ShouldQueue
{
    use Queueable;

    /** Upper bound on PageCrawl rows per page; no retry is queued once it is reached. */
    private const MAX_CRAWLS_PER_PAGE = 3;

    /** Outcomes for which another crawl attempt is scheduled. */
    private const RETRYABLE_OUTCOMES = [
        CrawlOutcomeEnum::Failed,
        CrawlOutcomeEnum::Timeout,
        CrawlOutcomeEnum::Blocked5xx,
    ];

    /** Outcomes that handle() maps to a non-fetched page status. */
    private const UNSUCCESSFUL_OUTCOMES = [
        CrawlOutcomeEnum::Rejected,
        CrawlOutcomeEnum::Timeout,
        CrawlOutcomeEnum::Failed,
        CrawlOutcomeEnum::Blocked4xx,
        CrawlOutcomeEnum::Blocked5xx,
        CrawlOutcomeEnum::BlockedRobots,
    ];

    /**
     * @param PageCrawl $pageCrawl The crawl attempt this job is responsible for.
     */
    public function __construct(
        public PageCrawl $pageCrawl,
    ) {}

    /**
     * Run the crawl: acquire the per-domain lock, fetch the page, persist the
     * result, register discovered links and schedule a retry if appropriate.
     */
    public function handle(): void
    {
        $fetcher = resolve(FetchPageAction::class);
        $register = resolve(RegisterDiscoveredPageAction::class);
        $politenessService = resolve(PolitenessService::class);

        $delay = $politenessService->minDelayFor($this->pageCrawl->domain);

        // The lock is not released in this method; it is created with a TTL of
        // $delay seconds, so it appears to act as a per-domain rate limiter.
        // NOTE(review): if minDelayFor() can return 0 the lock may never
        // expire (a zero TTL means "no expiry" for Laravel locks) — confirm.
        $lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay);

        if (! $lock->get()) {
            // Another job holds the domain lock: put this job back on the queue.
            $this->release($delay);

            return;
        }

        /** @var FetchResult $result */
        $result = $fetcher($this->pageCrawl->page->url);

        $this->updatePageCrawl($result);

        // NOTE(review): only Timeout and Blocked4xx stamp failed_at; the other
        // failure outcomes leave it untouched. Preserved as-is — confirm intent.
        $update = match ($result->outcome) {
            CrawlOutcomeEnum::Rejected => [
                'status' => PageStatusEnum::Rejected,
                'fetched_at' => null,
            ],
            CrawlOutcomeEnum::Timeout,
            CrawlOutcomeEnum::Blocked4xx => [
                'status' => PageStatusEnum::Failed,
                'failed_at' => now(),
            ],
            CrawlOutcomeEnum::Failed,
            CrawlOutcomeEnum::Blocked5xx,
            CrawlOutcomeEnum::BlockedRobots => [
                'status' => PageStatusEnum::Failed,
            ],
            default => [
                'status' => PageStatusEnum::Fetched,
                'fetched_at' => now(),
                'title' => $result->title,
            ],
        };

        $this->pageCrawl->page->update($update);

        // NOTE(review): links are registered for every outcome except Failed,
        // which assumes outboundLinks is always a collection — confirm.
        if ($result->outcome !== CrawlOutcomeEnum::Failed) {
            $result->outboundLinks->each(fn (string $url) => $register($url));
        }

        if (in_array($result->outcome, self::RETRYABLE_OUTCOMES, true)) {
            $this->scheduleRetryIfNeeded($result, $this->pageCrawl);
        }
    }

    /**
     * Queue another crawl of the same page one hour from now, unless the page
     * already has MAX_CRAWLS_PER_PAGE crawl rows.
     *
     * The new row is a replica of $crawl with its result columns cleared.
     * replicate() is used instead of create($crawl->toArray()) because the
     * array form carries the primary key, the completion fields just written
     * by updatePageCrawl() and the loaded `page` relation into the new row.
     *
     * @param FetchResult $result Result of the attempt that just finished (currently unused).
     * @param PageCrawl   $crawl  The crawl row to base the retry on.
     */
    private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): void
    {
        $attempts = PageCrawl::where('page_id', $crawl->page_id)->count();

        if ($attempts >= self::MAX_CRAWLS_PER_PAGE) {
            return;
        }

        $newRow = PageCrawl::withoutEvents(function () use ($crawl): PageCrawl {
            $retry = $crawl->replicate();

            // Start the retry with a clean slate for everything an attempt records.
            $retry->outcome = null;
            $retry->completed_at = null;
            $retry->status_code = null;
            $retry->error_message = null;
            $retry->save();

            return $retry;
        });

        ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
    }

    /**
     * Persist the fetch result on this job's PageCrawl row.
     *
     * The actual outcome is stored (previously every outcome other than Failed
     * was recorded as Success with status code 200). A status code of 200 is
     * written only for outcomes that are not in UNSUCCESSFUL_OUTCOMES; for the
     * unsuccessful ones the status code is null and the result's error message
     * is kept.
     *
     * @param FetchResult $result The result returned by the fetch action.
     */
    public function updatePageCrawl(FetchResult $result): void
    {
        $unsuccessful = in_array($result->outcome, self::UNSUCCESSFUL_OUTCOMES, true);

        $this->pageCrawl->update([
            'outcome' => $result->outcome,
            'completed_at' => now(),
            'status_code' => $unsuccessful ? null : 200,
            'error_message' => $unsuccessful ? $result->errorMessage : null,
        ]);
    }
}