84 lines
2.4 KiB
PHP
84 lines
2.4 KiB
PHP
<?php
|
|
|
|
namespace App\Jobs;
|
|
|
|
use App\Actions\FetchPageAction;
|
|
use App\Actions\RegisterDiscoveredPageAction;
|
|
use App\Enums\CrawlOutcomeEnum;
|
|
use App\Enums\PageStatusEnum;
|
|
use App\Models\PageCrawl;
|
|
use App\ValueObjects\FetchResult;
|
|
use Illuminate\Contracts\Queue\ShouldQueue;
|
|
use Illuminate\Foundation\Queue\Queueable;
|
|
|
|
/**
 * Queued job that processes a single PageCrawl row.
 *
 * It fetches the URL of the crawl's page, records the fetch outcome on the
 * crawl row, updates the page's status accordingly, registers every outbound
 * link reported by the fetch, and schedules a delayed retry when the outcome
 * is a retryable failure.
 */
class ProcessCrawlJob implements ShouldQueue
{
    use Queueable;

    /**
     * Once a page has this many PageCrawl rows, no further retry is scheduled.
     */
    private const MAX_CRAWLS_PER_PAGE = 3;

    /**
     * @param PageCrawl $pageCrawl The crawl row this job processes.
     */
    public function __construct(
        public PageCrawl $pageCrawl,
    ) {}

    /**
     * Run the crawl and persist its result.
     *
     * @param FetchPageAction              $fetcher  Invoked with the page URL; returns a FetchResult.
     * @param RegisterDiscoveredPageAction $register Invoked once per outbound link URL.
     */
    public function handle(
        FetchPageAction $fetcher,
        RegisterDiscoveredPageAction $register,
    ): void {
        /** @var FetchResult $result */
        $result = $fetcher($this->pageCrawl->page->url);

        // Record what actually happened instead of unconditionally claiming success.
        $crawlUpdate = [
            'outcome' => $result->outcome,
            'completed_at' => now(),
        ];

        // NOTE(review): the HTTP status exposed by FetchResult is not visible from
        // this file. Until it is wired in, only a successful fetch is recorded as
        // 200; other outcomes leave status_code untouched rather than storing a
        // misleading 200. TODO confirm the FetchResult field and use it here.
        if ($result->outcome === CrawlOutcomeEnum::Success) {
            $crawlUpdate['status_code'] = 200;
        }

        $this->pageCrawl->update($crawlUpdate);

        $update = match ($result->outcome) {
            CrawlOutcomeEnum::Rejected => [
                'status' => PageStatusEnum::Rejected,
                'fetched_at' => null,
            ],
            // Every failure outcome marks the page as failed. Failed and Blocked5xx
            // previously fell through to the default arm and were stored as Fetched.
            CrawlOutcomeEnum::Timeout,
            CrawlOutcomeEnum::Blocked4xx,
            CrawlOutcomeEnum::Blocked5xx,
            CrawlOutcomeEnum::Failed => [
                'status' => PageStatusEnum::Failed,
                'failed_at' => now(),
            ],
            default => [
                'status' => PageStatusEnum::Fetched,
                'fetched_at' => now(),
                'title' => $result->title,
            ],
        };

        $result->outboundLinks->each(fn (string $url) => $register($url));

        $this->pageCrawl->page->update($update);

        // Blocked4xx and Rejected are deliberately absent: they are not retried.
        $retryableOutcomes = [
            CrawlOutcomeEnum::Failed,
            CrawlOutcomeEnum::Timeout,
            CrawlOutcomeEnum::Blocked5xx,
        ];

        if (in_array($result->outcome, $retryableOutcomes, true)) {
            $this->scheduleRetryIfNeeded($result, $this->pageCrawl);
        }
    }

    /**
     * Create a fresh PageCrawl row for the same page and dispatch a job for it
     * one hour from now, unless the page already has MAX_CRAWLS_PER_PAGE rows.
     *
     * @param FetchResult $result Result of the failed fetch (currently unused; kept for signature stability).
     * @param PageCrawl   $crawl  The crawl row that just failed.
     */
    private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): void
    {
        $crawlCount = PageCrawl::where('page_id', $crawl->page_id)->count();

        if ($crawlCount >= self::MAX_CRAWLS_PER_PAGE) {
            return;
        }

        // replicate() clones the row without its primary key and timestamps, which
        // avoids feeding the serialized model (id, loaded relations, stale result
        // columns) back through mass assignment. Model events stay muted, as before.
        $retryCrawl = PageCrawl::withoutEvents(function () use ($crawl): PageCrawl {
            $copy = $crawl->replicate(['completed_at', 'status_code']);
            $copy->outcome = null;
            $copy->save();

            return $copy;
        });

        self::dispatch($retryCrawl)->delay(now()->addHour());
    }
}
|