158 lines
4.8 KiB
PHP
158 lines
4.8 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Actions;
|
|
|
|
use App\Enums\CrawlOutcomeEnum;
|
|
use App\Services\UrlService;
|
|
use App\ValueObjects\FetchResult;
|
|
use fivefilters\Readability\Configuration;
|
|
use fivefilters\Readability\Readability;
|
|
use GuzzleHttp\Exception\ConnectException;
|
|
use Illuminate\Http\Client\ConnectionException;
|
|
use Illuminate\Http\Client\Factory;
|
|
use Illuminate\Http\Client\Response;
|
|
use InvalidArgumentException;
|
|
use League\Uri\BaseUri;
|
|
use Symfony\Component\DomCrawler\Crawler;
|
|
use Throwable;
|
|
|
|
/**
 * Fetches a single page over HTTP and turns the response into a FetchResult.
 *
 * The action requests the URL with the crawler's configured timeout,
 * user agent and redirect limit, classifies the response (4xx, 5xx,
 * non-HTML, success) and, for successful HTML responses, extracts the page
 * title, the readable main text, its word count and the outbound links found
 * inside the main content.
 */
class FetchPageAction
{
    public function __construct(
        private Factory $http,
    ) {}

    /**
     * Fetch the given URL and summarise the outcome.
     *
     * Connection-level failures are converted into a FetchResult instead of
     * being thrown; see failureResult().
     *
     * @param  string  $url  URL to request.
     * @return FetchResult Outcome, status code, final URL and, on success,
     *                     the extracted title, text, links and word count.
     */
    public function __invoke(string $url): FetchResult
    {
        try {
            $response = $this->http
                ->timeout(config('crawler.timeout'))
                ->withHeaders([
                    'User-Agent' => config('crawler.user_agent'),
                    'Accept' => 'text/html',
                ])
                ->withOptions([
                    'allow_redirects' => ['max' => config('crawler.max_redirects')],
                ])
                ->get($url);
        } catch (ConnectionException|ConnectException $e) {
            return $this->failureResult($e);
        }

        // Redirects are followed, so the document may have been served from a
        // different address than the one requested. Report that address and
        // resolve relative links against it. effectiveUri() is null when no
        // transfer statistics were recorded, in which case the requested URL
        // is the best information available.
        $finalUrl = (string) ($response->effectiveUri() ?? $url);

        [$outcome, $error] = $this->validateResponse($response);

        // Defaults for every non-success outcome.
        $title = null;
        $extractedText = null;
        $links = collect();
        $wordCount = null;

        if ($outcome === CrawlOutcomeEnum::Success) {
            [$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $finalUrl);
            $wordCount = $this->countWords($extractedText);
        }

        return new FetchResult(
            outcome: $outcome,
            statusCode: $response->status(),
            finalUrl: $finalUrl,
            title: $title,
            extractedText: $extractedText,
            outboundLinks: $links,
            wordCount: $wordCount,
            errorMessage: $error,
        );
    }

    /**
     * Classify a response by status code and content type.
     *
     * @return array{0: CrawlOutcomeEnum, 1: ?string} Outcome and, for
     *                                                non-success outcomes, an error message.
     */
    private function validateResponse(Response $response): array
    {
        $status = $response->status();

        if ($status >= 400 && $status < 500) {
            return [CrawlOutcomeEnum::Blocked4xx, "HTTP {$status}"];
        }

        if ($status >= 500 && $status < 600) {
            return [CrawlOutcomeEnum::Blocked5xx, "HTTP {$status}"];
        }

        $contentType = $response->header('Content-Type');

        // Media types are case-insensitive, so compare on a normalised copy
        // while keeping the original header value for the error message.
        if (! str_starts_with(strtolower(ltrim($contentType)), 'text/html')) {
            return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"];
        }

        return [CrawlOutcomeEnum::Success, null];
    }

    /**
     * Build the FetchResult for a request that failed before any response
     * was received.
     *
     * The outcome is Timeout when the underlying Guzzle exception reports
     * cURL's "operation timed out" errno, and Failed otherwise.
     */
    private function failureResult(ConnectionException|ConnectException $e): FetchResult
    {
        $previous = $e->getPrevious();

        // The handler context lives on Guzzle's ConnectException, which is
        // either the exception itself or the one it wraps.
        $guzzleException = match (true) {
            $e instanceof ConnectException => $e,
            $previous instanceof ConnectException => $previous,
            default => null,
        };

        $errno = $guzzleException?->getHandlerContext()['errno'] ?? null;

        $outcome = $errno === CURLE_OPERATION_TIMEDOUT
            ? CrawlOutcomeEnum::Timeout
            : CrawlOutcomeEnum::Failed;

        return new FetchResult(
            outcome: $outcome,
            statusCode: null,
            finalUrl: null,
            title: null,
            extractedText: null,
            outboundLinks: collect(),
            wordCount: null,
            errorMessage: $e->getMessage(),
        );
    }

    /**
     * Extract the page title, the readable main text and the outbound links
     * contained in the main content.
     *
     * @param  string  $body  Raw HTML of the page.
     * @param  string  $url  URL the page was served from; base for relative links.
     * @return array{0: ?string, 1: string, 2: \Illuminate\Support\Collection<int, string>}
     *                                                                                     Title (null when the page has no <title>), plain text, and the
     *                                                                                     de-duplicated absolute links that passed validation.
     */
    private function extractTitleTextAndLinks(string $body, string $url): array
    {
        $titleNode = (new Crawler($body))->filter('title');
        $title = $titleNode->count() > 0
            ? trim($titleNode->text())
            : null;

        // NOTE(review): Readability::parse() may throw on markup it cannot
        // process; that is left to propagate here - confirm callers expect it.
        $readability = new Readability(new Configuration);
        $readability->parse($body);
        $mainContent = $readability->getContent() ?? '';
        $extractedText = trim(strip_tags($mainContent));

        // Only links inside the main content are collected.
        $links = $mainContent !== ''
            ? collect((new Crawler($mainContent))->filter('a[href]')->extract(['href']))
            : collect();

        $linksResolved = $links
            ->map(fn (string $href) => $this->resolveAndValidateLink($href, $url))
            ->filter()
            ->unique()
            ->values();

        return [$title, $extractedText, $linksResolved];
    }

    /**
     * Count whitespace-separated words in the given text.
     *
     * Returns 0 for empty or whitespace-only text.
     */
    private function countWords(string $text): int
    {
        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        if ($words === false) {
            // The /u modifier fails on invalid UTF-8; fall back to a
            // byte-wise split rather than crashing on a badly encoded page.
            $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        }

        return count($words);
    }

    /**
     * Resolve an href against the page URL, drop any fragment and validate
     * the result.
     *
     * @param  string  $href  Raw href attribute value.
     * @param  string  $finalUrl  URL the page was served from.
     * @return ?string Absolute URL without fragment, or null when the href
     *                 cannot be resolved or is rejected by UrlService::host().
     */
    private function resolveAndValidateLink(string $href, string $finalUrl): ?string
    {
        try {
            $resolved = (string) BaseUri::from($finalUrl)->resolve($href);
            // Strip the fragment; strstr() yields false when there is none.
            $resolved = strstr($resolved, '#', true) ?: $resolved;
        } catch (Throwable) {
            // Best effort: a malformed href is skipped, not fatal.
            return null;
        }

        try {
            // Called only for its validation side effect; presumably throws
            // for URLs without a usable host - verify against UrlService.
            app(UrlService::class)->host($resolved);
        } catch (InvalidArgumentException) {
            return null;
        }

        return $resolved;
    }
}
|