111 lines
3.2 KiB
PHP
111 lines
3.2 KiB
PHP
<?php
|
|
|
|
namespace App\Jobs;
|
|
|
|
use App\Actions\FetchPageAction;
|
|
use App\Actions\RegisterDiscoveredPageAction;
|
|
use App\Enums\CrawlOutcomeEnum;
|
|
use App\Enums\PageStatusEnum;
|
|
use App\Models\PageCrawl;
|
|
use App\ValueObjects\FetchResult;
|
|
use Illuminate\Contracts\Queue\ShouldQueue;
|
|
use Illuminate\Foundation\Queue\Queueable;
|
|
|
|
/**
 * Queued job that processes a single PageCrawl row.
 *
 * It fetches the URL of the crawl's page, stores the fetch result on the crawl
 * row, updates the page's status, registers the outbound links found in the
 * result, and schedules another crawl attempt for retryable outcomes.
 */
class ProcessCrawlJob implements ShouldQueue
{
    use Queueable;

    /** Once a page has this many PageCrawl rows, no further retry is scheduled. */
    private const MAX_CRAWLS_PER_PAGE = 3;

    /** Outcomes for which the page was not fetched; every other outcome is treated as fetched. */
    private const UNSUCCESSFUL_OUTCOMES = [
        CrawlOutcomeEnum::Rejected,
        CrawlOutcomeEnum::Timeout,
        CrawlOutcomeEnum::Failed,
        CrawlOutcomeEnum::Blocked4xx,
        CrawlOutcomeEnum::Blocked5xx,
        CrawlOutcomeEnum::BlockedRobots,
    ];

    /** Outcomes that lead to a delayed retry (subject to MAX_CRAWLS_PER_PAGE). */
    private const RETRYABLE_OUTCOMES = [
        CrawlOutcomeEnum::Failed,
        CrawlOutcomeEnum::Timeout,
        CrawlOutcomeEnum::Blocked5xx,
    ];

    /**
     * @param PageCrawl $pageCrawl The crawl attempt this job processes.
     */
    public function __construct(
        public PageCrawl $pageCrawl,
    ) {}

    /**
     * Run the crawl: fetch, persist the result, register links, maybe retry.
     *
     * @param FetchPageAction              $fetcher  Invoked with the page URL; yields a FetchResult.
     * @param RegisterDiscoveredPageAction $register Invoked once per outbound link URL.
     */
    public function handle(
        FetchPageAction $fetcher,
        RegisterDiscoveredPageAction $register,
    ): void {
        /** @var FetchResult $result */
        $result = $fetcher($this->pageCrawl->page->url);

        $this->updatePageCrawl($result);

        $this->pageCrawl->page->update($this->pageAttributesFor($result));

        // NOTE(review): links are registered for every outcome except Failed, which
        // includes Timeout/Blocked*/Rejected. Presumably their link collection is
        // empty; confirm whether this should be limited to fetched pages.
        if ($result->outcome !== CrawlOutcomeEnum::Failed) {
            // The callback deliberately returns nothing: Collection::each() stops
            // iterating as soon as its callback returns false, so forwarding the
            // action's return value could silently skip the remaining links.
            $result->outboundLinks->each(function (string $url) use ($register): void {
                $register($url);
            });
        }

        if (in_array($result->outcome, self::RETRYABLE_OUTCOMES, true)) {
            $this->scheduleRetryIfNeeded($this->pageCrawl);
        }
    }

    /**
     * Build the attribute changes to apply to the page for a given fetch result.
     *
     * @return array<string, mixed>
     */
    private function pageAttributesFor(FetchResult $result): array
    {
        return match ($result->outcome) {
            CrawlOutcomeEnum::Rejected => [
                'status' => PageStatusEnum::Rejected,
                'fetched_at' => null,
            ],
            // Every outcome that marks the page Failed also stamps failed_at, so the
            // status and its timestamp never disagree.
            CrawlOutcomeEnum::Timeout,
            CrawlOutcomeEnum::Failed,
            CrawlOutcomeEnum::Blocked4xx,
            CrawlOutcomeEnum::Blocked5xx,
            CrawlOutcomeEnum::BlockedRobots => [
                'status' => PageStatusEnum::Failed,
                'failed_at' => now(),
            ],
            default => [
                'status' => PageStatusEnum::Fetched,
                'fetched_at' => now(),
                'title' => $result->title,
            ],
        };
    }

    /**
     * Create a fresh crawl row for the same page and dispatch it one hour from now,
     * unless the page already has MAX_CRAWLS_PER_PAGE crawl rows.
     */
    private function scheduleRetryIfNeeded(PageCrawl $crawl): void
    {
        $crawlCount = PageCrawl::where('page_id', $crawl->page_id)->count();

        if ($crawlCount >= self::MAX_CRAWLS_PER_PAGE) {
            return;
        }

        $retry = PageCrawl::withoutEvents(function () use ($crawl): PageCrawl {
            // replicate() copies the attributes without the primary key and the
            // created/updated timestamps. The result columns written by
            // updatePageCrawl() are reset so the new row starts out pending.
            $copy = $crawl->replicate()->forceFill([
                'outcome' => null,
                'completed_at' => null,
                'status_code' => null,
                'error_message' => null,
            ]);
            $copy->save();

            return $copy;
        });

        self::dispatch($retry)->delay(now()->addHour());
    }

    /**
     * Persist the fetch result on the crawl row.
     *
     * The outcome reported by the fetcher is stored as-is. Unsuccessful outcomes
     * keep their error message and get no status code; all other outcomes are
     * stored with status code 200 and no error message.
     */
    public function updatePageCrawl(FetchResult $result): void
    {
        $unsuccessful = in_array($result->outcome, self::UNSUCCESSFUL_OUTCOMES, true);

        $this->pageCrawl->update([
            'outcome' => $result->outcome,
            'completed_at' => now(),
            // NOTE(review): the real HTTP status is not read from the result here;
            // 200 is assumed for fetched pages, as before.
            'status_code' => $unsuccessful ? null : 200,
            'error_message' => $unsuccessful ? $result->errorMessage : null,
        ]);
    }
}
|