trove/app/Services/RobotsService.php

61 lines
1.7 KiB
PHP
Raw Normal View History

<?php
declare(strict_types=1);
namespace App\Services;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Http;
use Spatie\Robots\RobotsTxt;
/**
 * Answers robots.txt questions (access permission, crawl delay) for a host.
 *
 * The raw robots.txt body is fetched over HTTPS and cached per host under the
 * key "crawler:robots:{host}" for `crawler.robots_cache_ttl_seconds`.
 *
 * A connection failure or a non-successful HTTP response is cached as an empty
 * body. The parser receives an empty rule set in that case, and the empty body
 * stays cached until the TTL expires.
 */
class RobotsService
{
    public function __construct(
        private UrlService $urlService,
    ) {}

    /**
     * Tells whether robots.txt of the URL's host permits fetching the URL.
     *
     * @param string      $url       Absolute URL that is about to be crawled.
     * @param string|null $userAgent User agent to evaluate rules for; passed
     *                               through to the robots.txt parser unchanged.
     */
    public function isAllowed(string $url, ?string $userAgent = null): bool
    {
        // UrlService::host() is defined elsewhere; normalise its result to a
        // string before it is used as cache key and URL component.
        $host = (string) $this->urlService->host($url);

        return (new RobotsTxt($this->robotsBodyFor($host)))
            ->allows($this->requestTargetOf($url), $userAgent);
    }

    /**
     * Returns the crawl delay robots.txt requests for the given user agent.
     *
     * @param string $host      Host whose robots.txt should be consulted.
     * @param string $userAgent User agent to look the directive up for.
     *
     * @return int|null Delay rounded up to a whole number, or null when the
     *                  parser reports no delay or the value is not numeric.
     */
    public function crawlDelayFor(string $host, string $userAgent): ?int
    {
        $delay = (new RobotsTxt($this->robotsBodyFor($host)))->crawlDelay($userAgent);

        if ($delay === null || ! is_numeric($delay)) {
            return null;
        }

        // Round up rather than truncate: a fractional delay such as "0.5"
        // must not collapse to 0, which would ignore the site's request.
        return (int) ceil((float) $delay);
    }

    /**
     * Returns the cached robots.txt body for a host, fetching it on a cache miss.
     *
     * @return string The response body, or '' when the request could not
     *                connect or did not return a successful status.
     */
    private function robotsBodyFor(string $host): string
    {
        return Cache::remember(
            "crawler:robots:{$host}",
            config('crawler.robots_cache_ttl_seconds'),
            static function () use ($host): string {
                try {
                    $response = Http::get("https://{$host}/robots.txt");

                    return $response->successful() ? $response->body() : '';
                } catch (ConnectionException) {
                    // Best effort: an unreachable host yields an empty rule set.
                    return '';
                }
            },
        );
    }

    /**
     * Extracts the part of a URL that robots.txt rules are matched against.
     *
     * @return string Path plus "?query" when a query string is present;
     *                '/' when the URL has no path or cannot be parsed.
     */
    private function requestTargetOf(string $url): string
    {
        $parts = parse_url($url);

        // parse_url() returns false for seriously malformed URLs; fall back to
        // the root path instead of handing a non-string to the parser.
        if ($parts === false) {
            return '/';
        }

        $path = $parts['path'] ?? '/';

        // Keep the query string so rules that target it can still match.
        return isset($parts['query']) ? "{$path}?{$parts['query']}" : $path;
    }
}