isAllowed($this->pageCrawl->page->url)) {
            // Branch chosen by the isAllowed() check whose condition begins above this point:
            // record a robots block on the crawl row, mark the page as failed, and stop.
            $this->pageCrawl->update([
                'outcome' => CrawlOutcomeEnum::BlockedRobots,
                'completed_at' => now(),
            ]);
            $this->pageCrawl->page->update(['status' => PageStatusEnum::Failed]);

            return;
        }

        $fetcher = resolve(FetchPageAction::class);
        $register = resolve(RegisterDiscoveredPageAction::class);
        $politenessService = resolve(PolitenessService::class);

        // One lock per domain, with a TTL equal to that domain's minimum delay.
        // NOTE(review): the lock is never released explicitly in the code visible here;
        // presumably it is meant to expire on its own so the domain is not fetched again
        // before $delay has elapsed — confirm.
        $delay = $politenessService->minDelayFor($this->pageCrawl->domain);
        $lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay);

        if (! $lock->get()) {
            // Lock not obtained: release the job with the same delay and stop.
            $this->release($delay);

            return;
        }

        $result = $fetcher($this->pageCrawl->page->url);

        $this->writeOutcome($result);
        $this->updatePageStatus($result);

        if ($result->outcome->shouldRegisterOutboundLinks()) {
            $result->outboundLinks->each(fn (string $url) => $register($url));
        }

        if ($result->outcome->isRetryable()) {
            $this->scheduleRetryIfNeeded();
        }
    }

    /**
     * Persist the result of this fetch on the crawl row: outcome, completion time,
     * HTTP status code and error message.
     */
    private function writeOutcome(FetchResult $result): void
    {
        $this->pageCrawl->update([
            'outcome' => $result->outcome,
            'completed_at' => now(),
            'status_code' => $result->statusCode,
            'error_message' => $result->errorMessage,
        ]);
    }

    /**
     * Update the crawled page according to the page status the fetch outcome maps to.
     *
     * The match is exhaustive over the four statuses listed; any other status raises
     * an UnhandledMatchError.
     */
    private function updatePageStatus(FetchResult $result): void
    {
        $status = $result->outcome->toPageStatus();

        $update = match ($status) {
            PageStatusEnum::Fetched => [
                'status' => $status,
                'fetched_at' => now(),
                'title' => $result->title,
                // Sticky language: only write when detection produced a value, so a re-crawl
                // returning null doesn't erase a previously-detected language. Guarding on
                // language alone is sufficient because FetchPageAction::detectLanguage()
                // always returns the pair as both-null or both-non-null (never mixed).
                ...($result->language !== null
                    ? [
                        'language' => $result->language,
                        'language_confidence' => $result->languageConfidence,
                    ]
                    : []),
            ],
            PageStatusEnum::Failed => [
                'status' => $status,
                'failed_at' => now(),
            ],
            // These two statuses only change the status column.
            PageStatusEnum::Rejected, PageStatusEnum::Discovered => [
                'status' => $status,
            ],
        };

        $this->pageCrawl->page->update($update);
    }

    /**
     * Queue another crawl attempt for the same page one hour from now, unless the page
     * already has three or more crawl rows (the current one included).
     *
     * The retry row is a copy of the current crawl row with the per-attempt result
     * fields cleared, so it starts out as a pending attempt rather than inheriting the
     * completion time, status code and error message that writeOutcome() just stored.
     */
    private function scheduleRetryIfNeeded(): void
    {
        $attempts = PageCrawl::where('page_id', $this->pageCrawl->page_id)->count();

        if ($attempts >= 3) {
            return;
        }

        // Created with model events disabled, as before. replicate() leaves out the
        // primary key and timestamps itself; the listed columns are result data that
        // belongs only to the attempt that has just finished.
        $retry = PageCrawl::withoutEvents(function (): PageCrawl {
            $row = $this->pageCrawl->replicate([
                'completed_at',
                'status_code',
                'error_message',
            ]);
            $row->outcome = null;
            $row->save();

            return $row;
        });

        ProcessCrawlJob::dispatch($retry)->delay(now()->addHour());
    }
}