http
                ->timeout(config('crawler.timeout'))
                ->withHeaders([
                    'User-Agent' => config('crawler.user_agent'),
                    'Accept' => 'text/html',
                ])
                ->withOptions([
                    'allow_redirects' => ['max' => config('crawler.max_redirects')],
                ])
                ->get($url);
        } catch (ConnectionException|ConnectException $e) {
            return $this->failureResult($e);
        }

        [$outcome, $error] = $this->validateResponse($response);

        // Title, text, links and word count are only populated for a successful outcome;
        // for every other outcome the `??` fallbacks in the FetchResult below apply.
        if ($outcome === CrawlOutcomeEnum::Success) {
            [$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url);
            // NOTE(review): preg_split() returns false on a PCRE error (e.g. invalid UTF-8 with /u),
            // which would make count() throw — confirm the extracted text is always valid UTF-8.
            $wordCount = $extractedText !== '' ? count(preg_split('/\s+/u', trim($extractedText))) : 0;
        }

        // NOTE(review): finalUrl is the requested $url even though redirects are allowed — confirm intended.
        return new FetchResult(
            outcome: $outcome,
            statusCode: $response->status(),
            finalUrl: $url,
            title: $title ?? null,
            extractedText: $extractedText ?? null,
            outboundLinks: $links ?? collect(),
            wordCount: $wordCount ?? null,
            errorMessage: $error ?? null,
        );
    }

    /**
     * Classify a response as usable or not.
     *
     * Checks run in order: a 4xx status, then any status of 500 or above, then the
     * Content-Type header, which must start with "text/html" (compared case-insensitively).
     *
     * @return array{0: CrawlOutcomeEnum, 1: string|null} The outcome and, for every
     *                                                    non-success outcome, an error message.
     */
    private function validateResponse(Response $response): array
    {
        $status = $response->status();

        if ($status >= 400) {
            $blocked = $status < 500
                ? CrawlOutcomeEnum::Blocked4xx
                : CrawlOutcomeEnum::Blocked5xx;

            return [$blocked, "HTTP {$status}"];
        }

        $contentType = $response->header('Content-Type');
        $isHtml = str_starts_with(mb_strtolower($contentType), 'text/html');

        if (! $isHtml) {
            return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"];
        }

        return [CrawlOutcomeEnum::Success, null];
    }

    /**
     * Build the FetchResult for a request that failed at the connection level.
     *
     * The outcome is Timeout when the handler context of the Guzzle ConnectException
     * (the exception itself, or its previous exception) reports the cURL
     * "operation timed out" errno; otherwise it is Failed.
     */
    private function failureResult(ConnectionException|ConnectException $e): FetchResult
    {
        // The handler context lives on the Guzzle exception, which is either $e itself
        // or the exception that $e wraps.
        $previous = $e->getPrevious();
        $guzzleException = match (true) {
            $e instanceof ConnectException => $e,
            $previous instanceof ConnectException => $previous,
            default => null,
        };

        $errno = $guzzleException?->getHandlerContext()['errno'] ?? null;
        $timedOut = $errno === CURLE_OPERATION_TIMEDOUT;

        return new FetchResult(
            outcome: $timedOut ? CrawlOutcomeEnum::Timeout : CrawlOutcomeEnum::Failed,
            statusCode: null,
            finalUrl: null,
            title: null,
            extractedText: null,
            outboundLinks: collect(),
            wordCount: null,
            errorMessage: $e->getMessage(),
        );
    }

    /**
     * Extract the page title, the tag-stripped main content, and the outbound links
     * found inside that main content.
     *
     * Links are resolved against $url, invalid and self-referencing ones are dropped,
     * and duplicates are removed.
     *
     * @return array{0: string|null, 1: string, 2: \Illuminate\Support\Collection<int, string>}
     *         Title (null when the document has no <title>), extracted text, resolved links.
     */
    private function extractTitleTextAndLinks(string $body, string $url): array
    {
        $titleNodes = (new Crawler($body))->filter('title');
        $title = $titleNodes->count() > 0 ? trim($titleNodes->text()) : null;

        // NOTE(review): depending on the Readability library in use, parse() may throw when
        // no article can be extracted; nothing catches that here — confirm callers handle it.
        $readability = new Readability(new Configuration);
        $readability->parse($body);

        $mainContent = $readability->getContent() ?? '';
        $extractedText = trim(strip_tags($mainContent));

        // Only anchors inside the extracted main content count as outbound links.
        // extract() yields an empty array when no anchor matches.
        $hrefs = $mainContent !== ''
            ? collect((new Crawler($mainContent))->filter('a[href]')->extract(['href']))
            : collect();

        $linksResolved = $hrefs
            ->map(fn (string $href) => $this->resolveAndValidateLink($href, $url))
            ->filter()
            ->unique()
            ->values();

        return [$title, $extractedText, $linksResolved];
    }

    /**
     * Resolve $href against the page URL and return it without its fragment, or null
     * when the link should be ignored.
     *
     * A link is ignored when it cannot be resolved, when it points back at the page
     * itself, or when the URL service cannot derive a host from it.
     *
     * @param string $href     Raw href attribute value (may be relative).
     * @param string $finalUrl URL of the page the link was found on.
     */
    private function resolveAndValidateLink(string $href, string $finalUrl): ?string
    {
        // strstr() returns false when there is no '#'; fall back to the untouched URI then.
        $stripFragment = static fn (string $uri): string => strstr($uri, '#', true) ?: $uri;

        try {
            $resolved = $stripFragment((string) BaseUri::from($finalUrl)->resolve($href));
        } catch (Throwable) {
            // Best effort: an href that cannot be resolved is simply not a usable link.
            return null;
        }

        // Compare fragment-less URLs on both sides: the resolved link has already lost its
        // fragment, so a page URL that carries one would otherwise never match and in-page
        // anchors would be reported as outbound links.
        if ($resolved === $stripFragment($finalUrl)) {
            return null;
        }

        try {
            // Called only for validation; the returned host is not needed.
            $this->urlService->host($resolved);
        } catch (InvalidArgumentException) {
            return null;
        }

        return $resolved;
    }
}