2026-04-26 21:15:07 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
|
|
namespace App\Jobs;
|
|
|
|
|
|
2026-04-26 23:50:57 +02:00
|
|
|
use App\Actions\FetchPageAction;
|
|
|
|
|
use App\Actions\RegisterDiscoveredPageAction;
|
|
|
|
|
use App\Enums\CrawlOutcomeEnum;
|
|
|
|
|
use App\Enums\PageStatusEnum;
|
2026-04-26 21:15:07 +02:00
|
|
|
use App\Models\PageCrawl;
|
2026-04-27 01:25:46 +02:00
|
|
|
use App\Services\PolitenessService;
|
2026-04-26 23:50:57 +02:00
|
|
|
use App\ValueObjects\FetchResult;
|
2026-04-26 21:15:07 +02:00
|
|
|
use Illuminate\Contracts\Queue\ShouldQueue;
|
|
|
|
|
use Illuminate\Foundation\Queue\Queueable;
|
2026-04-27 01:25:46 +02:00
|
|
|
use Illuminate\Support\Facades\Cache;
|
2026-04-26 21:15:07 +02:00
|
|
|
|
|
|
|
|
/**
 * Queued job that fetches the page behind a single PageCrawl row.
 *
 * The job records the fetch result on the crawl row and on its page, registers
 * the outbound links found, and schedules a delayed retry (as a new PageCrawl
 * row) when the outcome is Failed, Timeout or Blocked5xx.
 */
class ProcessCrawlJob implements ShouldQueue
{
    use Queueable;

    /** Once a page has this many PageCrawl rows, no further retry is scheduled. */
    private const MAX_CRAWLS_PER_PAGE = 3;

    /**
     * @param PageCrawl $pageCrawl The crawl attempt this job executes.
     */
    public function __construct(
        public PageCrawl $pageCrawl,
    ) {}

    /**
     * Run the crawl: throttle per domain, fetch, persist the result, register
     * outbound links and schedule a retry when the outcome warrants one.
     */
    public function handle(): void
    {
        $fetcher = resolve(FetchPageAction::class);
        $register = resolve(RegisterDiscoveredPageAction::class);
        $politenessService = resolve(PolitenessService::class);

        $delay = $politenessService->minDelayFor($this->pageCrawl->domain);

        // The lock is deliberately never released: it acts as a per-domain
        // throttle that expires on its own after $delay seconds.
        // NOTE(review): Laravel treats a lock TTL of 0 as "never expires" —
        // confirm minDelayFor() cannot return 0.
        $lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay);

        if (! $lock->get()) {
            // Another crawl hit this domain too recently; put the job back on the queue.
            $this->release($delay);

            return;
        }

        /** @var FetchResult $result */
        $result = $fetcher($this->pageCrawl->page->url);

        $this->updatePageCrawl($result);

        $this->pageCrawl->page->update($this->pageAttributesFor($result));

        if ($result->outcome !== CrawlOutcomeEnum::Failed) {
            $result->outboundLinks->each(fn (string $url) => $register($url));
        }

        $retryable = [
            CrawlOutcomeEnum::Failed,
            CrawlOutcomeEnum::Timeout,
            CrawlOutcomeEnum::Blocked5xx,
        ];

        if (in_array($result->outcome, $retryable, true)) {
            $this->scheduleRetryIfNeeded($result, $this->pageCrawl);
        }
    }

    /**
     * Translate a fetch result into the attributes to write onto the page.
     *
     * Any outcome not listed explicitly is treated as a successful fetch.
     *
     * @return array<string, mixed>
     */
    private function pageAttributesFor(FetchResult $result): array
    {
        // NOTE(review): only Timeout and Blocked4xx stamp failed_at; Failed,
        // Blocked5xx and BlockedRobots do not. Preserved as-is — confirm intent.
        return match ($result->outcome) {
            CrawlOutcomeEnum::Rejected => [
                'status' => PageStatusEnum::Rejected,
                'fetched_at' => null,
            ],
            CrawlOutcomeEnum::Timeout,
            CrawlOutcomeEnum::Blocked4xx => [
                'status' => PageStatusEnum::Failed,
                'failed_at' => now(),
            ],
            CrawlOutcomeEnum::Failed,
            CrawlOutcomeEnum::Blocked5xx,
            CrawlOutcomeEnum::BlockedRobots => [
                'status' => PageStatusEnum::Failed,
            ],
            default => [
                'status' => PageStatusEnum::Fetched,
                'fetched_at' => now(),
                'title' => $result->title,
            ],
        };
    }

    /**
     * Create a fresh PageCrawl row for the same page and dispatch it one hour
     * from now, unless the page already has MAX_CRAWLS_PER_PAGE crawl rows.
     *
     * @param FetchResult $result The result that triggered the retry (currently unused).
     * @param PageCrawl   $crawl  The crawl attempt to clone for the retry.
     */
    private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): void
    {
        if (PageCrawl::where('page_id', $crawl->page_id)->count() >= self::MAX_CRAWLS_PER_PAGE) {
            return;
        }

        $newRow = PageCrawl::withoutEvents(function () use ($crawl): PageCrawl {
            // replicate() drops the primary key and timestamps and does not go
            // through toArray(), so the loaded `page` relation and the finished
            // attempt's identity cannot leak into the insert. The result
            // columns are reset so the retry row starts out pending.
            $retry = $crawl->replicate()->forceFill([
                'outcome' => null,
                'completed_at' => null,
                'status_code' => null,
                'error_message' => null,
            ]);

            $retry->save();

            return $retry;
        });

        self::dispatch($newRow)->delay(now()->addHour());
    }

    /**
     * Persist the fetch result onto this job's PageCrawl row.
     *
     * The actual outcome is stored. A status code of 200 is recorded only for
     * a Success outcome; for every other outcome the status code is null and
     * the result's error message is kept.
     */
    public function updatePageCrawl(FetchResult $result): void
    {
        $succeeded = $result->outcome === CrawlOutcomeEnum::Success;

        $this->pageCrawl->update([
            'outcome' => $result->outcome,
            'completed_at' => now(),
            'status_code' => $succeeded ? 200 : null,
            'error_message' => $succeeded ? null : $result->errorMessage,
        ]);
    }
}
|