trove/app/Jobs/ProcessCrawlJob.php

121 lines
3.4 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Jobs;
use App\Actions\FetchPageAction;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use App\Models\PageCrawl;
use App\Services\PolitenessService;
use App\Services\RobotsService;
use App\ValueObjects\FetchResult;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Queue\Queueable;
use Illuminate\Support\Facades\Cache;
/**
 * Queued job that performs a single crawl attempt for one page.
 *
 * Flow: check the robots rules for the page URL, take the per-domain cache
 * lock, fetch the page, persist the result on both the PageCrawl row and its
 * Page, register outbound links when the outcome asks for it, and queue a
 * delayed retry attempt when the outcome is retryable.
 */
class ProcessCrawlJob implements ShouldQueue
{
    use Queueable;

    /** Number of PageCrawl rows for a page at which no further retry is queued. */
    private const MAX_ATTEMPTS_PER_PAGE = 3;

    /**
     * @param PageCrawl $pageCrawl The crawl attempt this job executes.
     */
    public function __construct(
        public PageCrawl $pageCrawl,
    ) {}

    /**
     * Run the crawl attempt.
     *
     * Returns early (without fetching) when robots rules disallow the URL or
     * when the per-domain lock is currently held; in the latter case the job
     * is released back onto the queue.
     */
    public function handle(): void
    {
        $robotsService = resolve(RobotsService::class);

        if (! $robotsService->isAllowed($this->pageCrawl->page->url)) {
            $this->markBlockedByRobots();

            return;
        }

        $fetcher = resolve(FetchPageAction::class);
        $register = resolve(RegisterDiscoveredPageAction::class);
        $politenessService = resolve(PolitenessService::class);

        $delay = $politenessService->minDelayFor($this->pageCrawl->domain);

        // The lock is never released in this job; it is left to expire after
        // $delay seconds, which presumably is what spaces out requests to the
        // same domain.
        // NOTE(review): in Laravel a lock created with 0 seconds does not
        // expire — confirm minDelayFor() can never return 0.
        $lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay);

        if (! $lock->get()) {
            // Another job holds the domain lock: put this job back on the
            // queue and try again after the same delay.
            $this->release($delay);

            return;
        }

        $result = $fetcher($this->pageCrawl->page->url);

        $this->writeOutcome($result);
        $this->updatePageStatus($result);

        if ($result->outcome->shouldRegisterOutboundLinks()) {
            $result->outboundLinks->each(fn (string $url) => $register($url));
        }

        if ($result->outcome->isRetryable()) {
            $this->scheduleRetryIfNeeded();
        }
    }

    /**
     * Record that the crawl was refused by robots rules and mark the page failed.
     *
     * Stamps `failed_at` as well, so that this path agrees with
     * updatePageStatus(), which sets it whenever a page becomes Failed.
     */
    private function markBlockedByRobots(): void
    {
        $this->pageCrawl->update([
            'outcome' => CrawlOutcomeEnum::BlockedRobots,
            'completed_at' => now(),
        ]);

        $this->pageCrawl->page->update([
            'status' => PageStatusEnum::Failed,
            'failed_at' => now(),
        ]);
    }

    /**
     * Persist the fetch result on the crawl row and mark it completed.
     */
    private function writeOutcome(FetchResult $result): void
    {
        $this->pageCrawl->update([
            'outcome' => $result->outcome,
            'completed_at' => now(),
            'status_code' => $result->statusCode,
            'error_message' => $result->errorMessage,
        ]);
    }

    /**
     * Translate the fetch outcome into a page status and persist it, together
     * with the status-specific columns (fetch metadata or failure timestamp).
     *
     * The match has no default arm on purpose: an unmapped PageStatusEnum case
     * raises \UnhandledMatchError instead of being written silently.
     */
    private function updatePageStatus(FetchResult $result): void
    {
        $status = $result->outcome->toPageStatus();

        $update = match ($status) {
            PageStatusEnum::Fetched => [
                'status' => $status,
                'fetched_at' => now(),
                'title' => $result->title,
                'language' => $result->language,
                'language_confidence' => $result->languageConfidence,
            ],
            PageStatusEnum::Failed => [
                'status' => $status,
                'failed_at' => now(),
            ],
            PageStatusEnum::Rejected, PageStatusEnum::Discovered => [
                'status' => $status,
            ],
        };

        $this->pageCrawl->page->update($update);
    }

    /**
     * Queue one more attempt for this page, one hour from now, unless the page
     * already has MAX_ATTEMPTS_PER_PAGE crawl rows.
     *
     * The new row is created with model events disabled and starts from the
     * current row's attributes, minus everything writeOutcome() recorded for
     * the attempt that just finished.
     */
    private function scheduleRetryIfNeeded(): void
    {
        $attempts = PageCrawl::where('page_id', $this->pageCrawl->page_id)->count();

        if ($attempts >= self::MAX_ATTEMPTS_PER_PAGE) {
            return;
        }

        // writeOutcome() has already run on this model, so its serialized form
        // carries the finished attempt's completion data. Drop those keys so
        // the retry row is not born looking completed; the columns then fall
        // back to their database defaults.
        $attributes = array_diff_key(
            $this->pageCrawl->toArray(),
            array_flip(['completed_at', 'status_code', 'error_message']),
        );
        $attributes['outcome'] = null;

        $retry = PageCrawl::withoutEvents(
            static fn (): PageCrawl => PageCrawl::create($attributes),
        );

        self::dispatch($retry)->delay(now()->addHour());
    }
}