trove/app/Actions/FetchPageAction.php

194 lines
6 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Actions;
use App\Enums\CrawlOutcomeEnum;
use App\Services\LanguageDetectionService;
use App\Services\UrlService;
use App\ValueObjects\FetchResult;
use fivefilters\Readability\Configuration;
use fivefilters\Readability\Readability;
use GuzzleHttp\Exception\ConnectException;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\Factory;
use Illuminate\Http\Client\Response;
use InvalidArgumentException;
use League\Uri\BaseUri;
use Symfony\Component\DomCrawler\Crawler;
use Throwable;
/**
 * Fetches a single URL over HTTP and converts the response into a FetchResult.
 *
 * For successful HTML responses the action extracts the page title, the main
 * readable text, the outbound links found inside that main content, a word
 * count and a best-effort language guess. Connection-level failures are
 * reported through the returned FetchResult instead of being thrown.
 */
class FetchPageAction
{
    /** Minimum number of extracted words before text-based language detection is attempted. */
    private const MIN_WORDS_FOR_TEXT_DETECTION = 20;

    /** Minimum confidence a text-based detection must reach to be accepted. */
    private const MIN_TEXT_DETECTION_CONFIDENCE = 0.30;

    public function __construct(
        private Factory $http,
        private UrlService $urlService,
        private LanguageDetectionService $languageDetection,
    ) {}

    /**
     * Fetch $url and describe the outcome.
     *
     * Timeout, user agent and redirect limit are read from the `crawler.*`
     * configuration keys. Only connection exceptions are converted into a
     * failure result; any other exception propagates to the caller.
     *
     * @param  string  $url  Absolute URL to fetch.
     */
    public function __invoke(string $url): FetchResult
    {
        try {
            $response = $this->http
                ->timeout(config('crawler.timeout'))
                ->withHeaders([
                    'User-Agent' => config('crawler.user_agent'),
                    'Accept' => 'text/html',
                ])
                ->withOptions([
                    'allow_redirects' => ['max' => config('crawler.max_redirects')],
                ])
                ->get($url);
        } catch (ConnectionException|ConnectException $e) {
            return $this->failureResult($e);
        }

        // Redirects are followed, so the document may have been served from a
        // different URL than the one requested. Report that URL and use it as
        // the base for relative links; fall back to the requested URL when the
        // client exposes no transfer statistics (e.g. faked responses).
        $finalUrl = (string) ($response->effectiveUri() ?? $url);

        [$outcome, $error] = $this->validateResponse($response);

        $title = null;
        $extractedText = null;
        $links = collect();
        $wordCount = null;
        $language = null;
        $languageConfidence = null;

        if ($outcome === CrawlOutcomeEnum::Success) {
            [$title, $extractedText, $links, $crawler] = $this->extractTitleTextAndLinks(
                $response->body(),
                $finalUrl,
            );
            $wordCount = $this->countWords($extractedText);
            [$language, $languageConfidence] = $this->detectLanguage($crawler, $extractedText, $wordCount);
        }

        return new FetchResult(
            outcome: $outcome,
            statusCode: $response->status(),
            finalUrl: $finalUrl,
            title: $title,
            extractedText: $extractedText,
            outboundLinks: $links,
            wordCount: $wordCount,
            errorMessage: $error,
            language: $language,
            languageConfidence: $languageConfidence,
        );
    }

    /**
     * Classify a response by status code and content type.
     *
     * 4xx and 5xx statuses are reported as blocked; any other response whose
     * Content-Type does not start with `text/html` is rejected.
     *
     * @return array{0: CrawlOutcomeEnum, 1: ?string} Outcome and an error message (null on success).
     */
    private function validateResponse(Response $response): array
    {
        $status = $response->status();

        if ($status >= 400 && $status < 500) {
            return [CrawlOutcomeEnum::Blocked4xx, "HTTP {$status}"];
        }

        if ($status >= 500) {
            return [CrawlOutcomeEnum::Blocked5xx, "HTTP {$status}"];
        }

        $contentType = $response->header('Content-Type');

        if (! str_starts_with(mb_strtolower($contentType), 'text/html')) {
            return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"];
        }

        return [CrawlOutcomeEnum::Success, null];
    }

    /**
     * Build the result for a request that never produced a response.
     *
     * The outcome is Timeout when the underlying Guzzle exception carries the
     * cURL "operation timed out" error number, and Failed otherwise.
     */
    private function failureResult(ConnectionException|ConnectException $e): FetchResult
    {
        // The Guzzle exception is either the exception itself or the
        // "previous" exception of the framework's ConnectionException.
        $guzzleException = match (true) {
            $e instanceof ConnectException => $e,
            $e->getPrevious() instanceof ConnectException => $e->getPrevious(),
            default => null,
        };

        $errno = $guzzleException?->getHandlerContext()['errno'] ?? null;

        $outcome = $errno === CURLE_OPERATION_TIMEDOUT
            ? CrawlOutcomeEnum::Timeout
            : CrawlOutcomeEnum::Failed;

        return new FetchResult(
            outcome: $outcome,
            statusCode: null,
            finalUrl: null,
            title: null,
            extractedText: null,
            outboundLinks: collect(),
            wordCount: null,
            errorMessage: $e->getMessage(),
        );
    }

    /**
     * Extract the title, main text and outbound links from an HTML document.
     *
     * The title comes from the first `<title>` element of the full document.
     * Text and links come only from the main content selected by Readability,
     * so links outside that content are not reported. Links are resolved
     * against $url, validated, de-duplicated and re-indexed.
     *
     * NOTE(review): Readability::parse() can throw for documents it cannot
     * parse (see the library documentation) and nothing here catches that —
     * confirm the caller handles it.
     *
     * @param  string  $body  Raw HTML of the response.
     * @param  string  $url  URL the document was served from; base for relative links.
     * @return array{0: ?string, 1: string, 2: \Illuminate\Support\Collection<int, string>, 3: Crawler}
     *         Title, extracted text, resolved links and the crawler for the full document.
     */
    private function extractTitleTextAndLinks(string $body, string $url): array
    {
        $crawler = new Crawler($body);

        $titleNode = $crawler->filter('title');
        $title = $titleNode->count() > 0
            ? trim($titleNode->text())
            : null;

        $readability = new Readability(new Configuration);
        $readability->parse($body);

        $mainContent = $readability->getContent() ?? '';
        $extractedText = trim(strip_tags($mainContent));

        // extract() yields an empty list when nothing matches, so no separate
        // count() guard is needed.
        $links = $mainContent !== ''
            ? collect((new Crawler($mainContent))->filter('a[href]')->extract(['href']))
            : collect();

        $linksResolved = $links
            ->map(fn (string $href) => $this->resolveAndValidateLink($href, $url))
            ->filter()
            ->unique()
            ->values();

        return [$title, $extractedText, $linksResolved, $crawler];
    }

    /**
     * Resolve a raw href against the page URL and decide whether to keep it.
     *
     * @param  string  $href  The href attribute value as found in the document.
     * @param  string  $finalUrl  URL of the page containing the link.
     * @return ?string The absolute URL without its fragment, or null when the href cannot be
     *                 resolved, points back at the page itself, or is rejected by the URL service.
     */
    private function resolveAndValidateLink(string $href, string $finalUrl): ?string
    {
        try {
            $resolved = (string) BaseUri::from($finalUrl)->resolve($href);
            // Drop the fragment; strstr() yields false when there is no '#'.
            $resolved = strstr($resolved, '#', true) ?: $resolved;
        } catch (Throwable) {
            return null;
        }

        if ($resolved === $finalUrl) {
            return null;
        }

        try {
            // Called only for validation: the return value is discarded and an
            // InvalidArgumentException means the link is dropped.
            $this->urlService->host($resolved);
        } catch (InvalidArgumentException) {
            return null;
        }

        return $resolved;
    }

    /**
     * Count whitespace-separated words in $text.
     *
     * Empty tokens are discarded so leading or trailing whitespace that
     * trim() does not strip cannot inflate the count. When the text is not
     * valid UTF-8 the Unicode-aware split fails, so a byte-wise split is used
     * instead of passing `false` to count().
     */
    private function countWords(string $text): int
    {
        if ($text === '') {
            return 0;
        }

        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        if ($words === false) {
            $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        }

        return count($words);
    }

    /**
     * Guess the page language.
     *
     * Text-based detection is tried first, but only when the text has at
     * least MIN_WORDS_FOR_TEXT_DETECTION words, and its answer is accepted
     * only at MIN_TEXT_DETECTION_CONFIDENCE or above. Otherwise the `lang`
     * attribute of the `<html>` element is used, with a confidence of 1.0,
     * provided it is non-empty and at most 35 bytes long.
     *
     * @return array{0: ?string, 1: ?float}
     */
    private function detectLanguage(Crawler $crawler, string $extractedText, ?int $wordCount = null): array
    {
        if ($wordCount >= self::MIN_WORDS_FOR_TEXT_DETECTION) {
            $result = $this->languageDetection->detect($extractedText);

            if ($result !== null && $result[1] >= self::MIN_TEXT_DETECTION_CONFIDENCE) {
                return [$result[0], $result[1]];
            }
        }

        $htmlNode = $crawler->filter('html');
        $lang = $htmlNode->count() > 0
            ? trim($htmlNode->attr('lang') ?? '')
            : '';

        if ($lang !== '' && strlen($lang) <= 35) {
            return [$lang, 1.0];
        }

        return [null, null];
    }
}