60 lines
1.7 KiB
PHP
60 lines
1.7 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Services;
|
|
|
|
use Illuminate\Http\Client\ConnectionException;
|
|
use Illuminate\Support\Facades\Cache;
|
|
use Illuminate\Support\Facades\Http;
|
|
use Spatie\Robots\RobotsTxt;
|
|
|
|
/**
 * Answers robots.txt questions (URL permission, crawl delay) for a host.
 *
 * The raw robots.txt body is requested from "https://{host}/robots.txt" and
 * cached per host under the key "crawler:robots:{host}", so both public
 * methods share the same cached copy.
 */
class RobotsService
{
    public function __construct(
        private UrlService $urlService,
    ) {}

    /**
     * Check whether the robots.txt of the URL's host permits fetching the URL.
     *
     * @param string      $url       URL to check; its host (as resolved by
     *                               UrlService::host()) selects the robots.txt.
     * @param string|null $userAgent User agent to evaluate the rules for; handed
     *                               to RobotsTxt::allows() unchanged.
     *
     * @return bool The verdict of RobotsTxt::allows() for the URL's path and query.
     */
    public function isAllowed(string $url, ?string $userAgent = null): bool
    {
        $host = $this->urlService->host($url);

        // parse_url() yields null for a missing component but false for a
        // seriously malformed URL. `??` only covers null, and under
        // strict_types a false would raise a TypeError inside allows(), so
        // anything that is not a string falls back to the root path.
        $path = parse_url($url, PHP_URL_PATH);
        $target = is_string($path) ? $path : '/';

        // Keep the query string: robots.txt rules may match on it
        // (e.g. "Disallow: /search?q="), so dropping it would let such
        // rules silently never apply.
        $query = parse_url($url, PHP_URL_QUERY);
        if (is_string($query)) {
            $target .= "?{$query}";
        }

        return (new RobotsTxt($this->robotsBody($host)))->allows($target, $userAgent);
    }

    /**
     * Return the crawl delay that the host's robots.txt declares for a user agent.
     *
     * @param string $host      Host whose robots.txt is consulted.
     * @param string $userAgent User agent to look the delay up for.
     *
     * @return int|null The value from RobotsTxt::crawlDelay() cast to int, or
     *                  null when that call returns null.
     */
    public function crawlDelayFor(string $host, string $userAgent): ?int
    {
        $delay = (new RobotsTxt($this->robotsBody($host)))->crawlDelay($userAgent);

        return $delay !== null ? (int) $delay : null;
    }

    /**
     * Fetch the robots.txt body for a host, cached for the configured TTL
     * ("crawler.robots_cache_ttl_seconds").
     *
     * A non-successful response or a connection failure yields an empty
     * string, and that empty result is cached like any other body.
     * NOTE(review): an empty body presumably parses as "no rules", which
     * would make a transient outage allow everything until the cache entry
     * expires; confirm this best-effort policy is intended.
     */
    private function robotsBody(string $host): string
    {
        return Cache::remember(
            "crawler:robots:{$host}",
            config('crawler.robots_cache_ttl_seconds'),
            static function () use ($host) {
                try {
                    $response = Http::get("https://{$host}/robots.txt");

                    return $response->successful() ? $response->body() : '';
                } catch (ConnectionException) {
                    return '';
                }
            },
        );
    }
}
|