105 lines
2.8 KiB
PHP
105 lines
2.8 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Jobs;
|
|
|
|
use App\Actions\FetchPageAction;
|
|
use App\Actions\RegisterDiscoveredPageAction;
|
|
use App\Enums\PageStatusEnum;
|
|
use App\Models\PageCrawl;
|
|
use App\Services\PolitenessService;
|
|
use App\ValueObjects\FetchResult;
|
|
use Illuminate\Contracts\Queue\ShouldQueue;
|
|
use Illuminate\Foundation\Queue\Queueable;
|
|
use Illuminate\Support\Facades\Cache;
|
|
|
|
/**
 * Queued job that performs one crawl attempt for a single PageCrawl row.
 *
 * The job fetches the URL of the crawl's page, stores the result on the crawl
 * row, mirrors the resulting status onto the page, registers outbound links
 * when the outcome allows it, and queues a follow-up attempt when the outcome
 * is retryable.
 */
class ProcessCrawlJob implements ShouldQueue
{
    use Queueable;

    /**
     * Once a page has this many crawl rows, no further retry is scheduled.
     */
    private const MAX_CRAWLS_PER_PAGE = 3;

    /**
     * Columns written by writeOutcome(); a retry row must not inherit them.
     */
    private const RESULT_COLUMNS = ['outcome', 'completed_at', 'status_code', 'error_message'];

    /**
     * @param PageCrawl $pageCrawl The crawl attempt this job processes.
     */
    public function __construct(
        public PageCrawl $pageCrawl,
    ) {}

    /**
     * Run the crawl attempt, or put the job back on the queue when the
     * per-domain lock is currently held.
     */
    public function handle(): void
    {
        $fetcher = resolve(FetchPageAction::class);
        $register = resolve(RegisterDiscoveredPageAction::class);
        $politenessService = resolve(PolitenessService::class);

        $delay = $politenessService->minDelayFor($this->pageCrawl->domain);

        // The lock is not released by this job; it lapses through its TTL of
        // $delay, which presumably is what spaces out requests to one domain.
        // NOTE(review): confirm minDelayFor() never returns 0 — a zero TTL may
        // mean "never expires" for the cache lock.
        $lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay);

        if (! $lock->get()) {
            // NOTE(review): every release counts as a queue attempt; confirm the
            // worker's tries / retryUntil settings tolerate lock contention.
            $this->release($delay);

            return;
        }

        $result = $fetcher($this->pageCrawl->page->url);

        $this->writeOutcome($result);
        $this->updatePageStatus($result);

        if ($result->outcome->shouldRegisterOutboundLinks()) {
            // A void closure is used on purpose: Collection::each() stops as
            // soon as its callback returns false, so the action's return value
            // must not leak into the iteration.
            $result->outboundLinks->each(static function (string $url) use ($register): void {
                $register($url);
            });
        }

        if ($result->outcome->isRetryable()) {
            $this->scheduleRetryIfNeeded();
        }
    }

    /**
     * Persist the fetch result on the crawl row and stamp its completion time.
     */
    private function writeOutcome(FetchResult $result): void
    {
        $this->pageCrawl->update([
            'outcome' => $result->outcome,
            'completed_at' => now(),
            'status_code' => $result->statusCode,
            'error_message' => $result->errorMessage,
        ]);
    }

    /**
     * Mirror the outcome onto the crawled page.
     *
     * Fetched pages also receive a fetch timestamp and title, failed pages a
     * failure timestamp; rejected and discovered pages only change status.
     * Any other status is not handled by the match and raises
     * \UnhandledMatchError.
     */
    private function updatePageStatus(FetchResult $result): void
    {
        $status = $result->outcome->toPageStatus();

        $update = match ($status) {
            PageStatusEnum::Fetched => [
                'status' => $status,
                'fetched_at' => now(),
                'title' => $result->title,
            ],
            PageStatusEnum::Failed => [
                'status' => $status,
                'failed_at' => now(),
            ],
            PageStatusEnum::Rejected, PageStatusEnum::Discovered => [
                'status' => $status,
            ],
        };

        $this->pageCrawl->page->update($update);
    }

    /**
     * Create a fresh crawl row for the same page and dispatch it one hour from
     * now, unless the page already has MAX_CRAWLS_PER_PAGE crawl rows.
     */
    private function scheduleRetryIfNeeded(): void
    {
        $crawlCount = PageCrawl::where('page_id', $this->pageCrawl->page_id)->count();

        if ($crawlCount >= self::MAX_CRAWLS_PER_PAGE) {
            return;
        }

        // Identity, timestamps and the result of the attempt that just finished
        // belong to the current row only. Timestamp column names may be null
        // when a model disables them, hence the filter.
        $excluded = array_filter(
            [
                $this->pageCrawl->getKeyName(),
                $this->pageCrawl->getCreatedAtColumn(),
                $this->pageCrawl->getUpdatedAtColumn(),
                ...self::RESULT_COLUMNS,
            ],
            static fn (?string $column): bool => $column !== null,
        );

        // attributesToArray() is used instead of toArray() so that loaded
        // relations (such as `page`) are not passed to create() as columns.
        $attributes = array_diff_key(
            $this->pageCrawl->attributesToArray(),
            array_flip($excluded),
        );
        $attributes['outcome'] = null;

        $newRow = PageCrawl::withoutEvents(
            static fn () => PageCrawl::create($attributes)
        );

        ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
    }
}
|