9 - Add robots.txt handling with cache and politeness integration
This commit is contained in:
parent
264180cd36
commit
cda1414cd8
9 changed files with 361 additions and 6 deletions
|
|
@ -6,9 +6,11 @@
|
|||
|
||||
use App\Actions\FetchPageAction;
|
||||
use App\Actions\RegisterDiscoveredPageAction;
|
||||
use App\Enums\CrawlOutcomeEnum;
|
||||
use App\Enums\PageStatusEnum;
|
||||
use App\Models\PageCrawl;
|
||||
use App\Services\PolitenessService;
|
||||
use App\Services\RobotsService;
|
||||
use App\ValueObjects\FetchResult;
|
||||
use Illuminate\Contracts\Queue\ShouldQueue;
|
||||
use Illuminate\Foundation\Queue\Queueable;
|
||||
|
|
@ -24,6 +26,18 @@ public function __construct(
|
|||
|
||||
public function handle(): void
|
||||
{
|
||||
$robotsService = resolve(RobotsService::class);
|
||||
|
||||
if (! $robotsService->isAllowed($this->pageCrawl->page->url)) {
|
||||
$this->pageCrawl->update([
|
||||
'outcome' => CrawlOutcomeEnum::BlockedRobots,
|
||||
'completed_at' => now(),
|
||||
]);
|
||||
$this->pageCrawl->page->update(['status' => PageStatusEnum::Failed]);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
$fetcher = resolve(FetchPageAction::class);
|
||||
$register = resolve(RegisterDiscoveredPageAction::class);
|
||||
$politenessService = resolve(PolitenessService::class);
|
||||
|
|
|
|||
|
|
@ -8,12 +8,12 @@ class PolitenessService
|
|||
{
|
||||
/**
 * Minimum number of seconds to wait between two requests to the same domain.
 *
 * Returns the larger of the configured floor (crawler.min_domain_delay_seconds,
 * falling back to 10 when unset) and the Crawl-delay that the domain's
 * robots.txt declares for our user agent. A site can therefore slow the
 * crawler down, but never push it below the configured floor.
 *
 * @param  string  $domain  Host name handed to RobotsService::crawlDelayFor().
 * @return int Delay in seconds.
 */
public function minDelayFor(string $domain): int
{
    /** @var RobotsService $robotsService */
    $robotsService = resolve(RobotsService::class);
    $crawlDelay = $robotsService->crawlDelayFor($domain, config('crawler.user_agent'));

    // The config entry is backed by env(), which yields a string when the
    // variable is set; cast so the declared int return type always holds.
    $configValue = (int) config('crawler.min_domain_delay_seconds', 10);

    // A missing Crawl-delay (null) counts as 0 so the configured floor wins.
    return max($crawlDelay ?? 0, $configValue);
}
|
||||
}
|
||||
|
|
|
|||
60
app/Services/RobotsService.php
Normal file
60
app/Services/RobotsService.php
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Services;
|
||||
|
||||
use Illuminate\Http\Client\ConnectionException;
|
||||
use Illuminate\Support\Facades\Cache;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use Spatie\Robots\RobotsTxt;
|
||||
|
||||
/**
 * Answers robots.txt questions (path permission, crawl delay) for a host.
 *
 * The robots.txt body is fetched over HTTPS and cached per host under
 * "crawler:robots:{host}" for crawler.robots_cache_ttl_seconds. A non-2xx
 * response or a connection failure is cached as an empty body, which results
 * in every path being treated as allowed.
 */
class RobotsService
{
    public function __construct(
        private UrlService $urlService,
    ) {}

    /**
     * Whether the robots.txt of the URL's host permits fetching the URL.
     *
     * @param  string       $url        Absolute URL about to be crawled.
     * @param  string|null  $userAgent  Agent whose rule group should be consulted;
     *                                  null falls back to the configured crawler.user_agent.
     */
    public function isAllowed(string $url, ?string $userAgent = null): bool
    {
        $host = $this->urlService->host($url);

        // parse_url() returns false for a malformed URL and null for a missing
        // component; neither may reach the string-typed parser API.
        $path = parse_url($url, PHP_URL_PATH);
        $target = is_string($path) && $path !== '' ? $path : '/';

        // Keep the query string: robots rules are matched against path + query.
        $query = parse_url($url, PHP_URL_QUERY);
        if (is_string($query)) {
            $target .= "?{$query}";
        }

        // Callers that pass no agent (e.g. the crawl job) should still be
        // judged by the rules written for this crawler, not only by "*".
        $agent = $userAgent ?? config('crawler.user_agent');

        return $this->robotsTxtFor($host)->allows($target, $agent);
    }

    /**
     * Crawl-delay (whole seconds) that the host's robots.txt declares for the agent.
     *
     * @param  string  $host       Host name whose robots.txt is consulted.
     * @param  string  $userAgent  Agent whose rule group should be consulted.
     * @return int|null Null when no usable (numeric) Crawl-delay is declared.
     */
    public function crawlDelayFor(string $host, string $userAgent): ?int
    {
        $delay = $this->robotsTxtFor($host)->crawlDelay($userAgent);

        if ($delay === null || ! is_numeric($delay)) {
            return null;
        }

        // Round fractional delays up: truncating "1.5" to 1 would crawl
        // faster than the site asked for.
        return (int) ceil((float) $delay);
    }

    /**
     * Load (from cache, else over HTTPS) and parse the robots.txt of a host.
     */
    private function robotsTxtFor(string $host): RobotsTxt
    {
        $body = Cache::remember(
            "crawler:robots:{$host}",
            config('crawler.robots_cache_ttl_seconds'),
            static function () use ($host) {
                try {
                    $response = Http::get("https://{$host}/robots.txt");

                    return $response->successful() ? $response->body() : '';
                } catch (ConnectionException) {
                    // Unreachable host: cache an empty body instead of failing the crawl.
                    return '';
                }
            },
        );

        return new RobotsTxt($body);
    }
}
|
||||
|
|
@ -21,6 +21,7 @@
|
|||
"laravel/tinker": "^3.0",
|
||||
"livewire/livewire": "^4.2",
|
||||
"lvl0/fedi-discover": "@dev",
|
||||
"spatie/robots-txt": "^2.5",
|
||||
"symfony/dom-crawler": "^7.4"
|
||||
},
|
||||
"require-dev": {
|
||||
|
|
|
|||
62
composer.lock
generated
62
composer.lock
generated
|
|
@ -4,7 +4,7 @@
|
|||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"content-hash": "2c63ed546b17b144997244f805e8a94a",
|
||||
"content-hash": "707278fe3558199c1d07f11dba1d20ec",
|
||||
"packages": [
|
||||
{
|
||||
"name": "brick/math",
|
||||
|
|
@ -3549,6 +3549,66 @@
|
|||
},
|
||||
"time": "2025-12-14T04:43:48+00:00"
|
||||
},
|
||||
{
|
||||
"name": "spatie/robots-txt",
|
||||
"version": "2.5.4",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/spatie/robots-txt.git",
|
||||
"reference": "a8dd35d0a94e863f52509a366a634978e9c1db03"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/spatie/robots-txt/zipball/a8dd35d0a94e863f52509a366a634978e9c1db03",
|
||||
"reference": "a8dd35d0a94e863f52509a366a634978e9c1db03",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"php": "^8.1"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/phpunit": "^11.5.2"
|
||||
},
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"Spatie\\Robots\\": "src"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"MIT"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Brent Roose",
|
||||
"email": "brent@spatie.be",
|
||||
"homepage": "https://spatie.be",
|
||||
"role": "Developer"
|
||||
}
|
||||
],
|
||||
"description": "Determine if a page may be crawled from robots.txt and robots meta tags",
|
||||
"homepage": "https://github.com/spatie/robots-txt",
|
||||
"keywords": [
|
||||
"robots-txt",
|
||||
"spatie"
|
||||
],
|
||||
"support": {
|
||||
"issues": "https://github.com/spatie/robots-txt/issues",
|
||||
"source": "https://github.com/spatie/robots-txt/tree/2.5.4"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
"url": "https://spatie.be/open-source/support-us",
|
||||
"type": "custom"
|
||||
},
|
||||
{
|
||||
"url": "https://github.com/spatie",
|
||||
"type": "github"
|
||||
}
|
||||
],
|
||||
"time": "2026-02-25T07:59:20+00:00"
|
||||
},
|
||||
{
|
||||
"name": "symfony/clock",
|
||||
"version": "v7.4.8",
|
||||
|
|
|
|||
|
|
@ -43,4 +43,5 @@
|
|||
'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'),
|
||||
|
||||
'min_domain_delay_seconds' => env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10),
|
||||
'robots_cache_ttl_seconds' => env('CRAWLER_ROBOTS_CACHE_TTL_SECONDS', 60 * 60 * 24),
|
||||
];
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@
|
|||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
||||
use Illuminate\Support\Collection;
|
||||
use Illuminate\Support\Facades\Cache;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use Illuminate\Support\Facades\Queue;
|
||||
use Mockery;
|
||||
use Tests\TestCase;
|
||||
|
|
@ -343,6 +344,46 @@ public function test_handle_does_not_release_lock_after_completion(): void
|
|||
$this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.');
|
||||
}
|
||||
|
||||
/**
 * A page disallowed by robots.txt is recorded as BlockedRobots / Failed,
 * is never fetched, and leaves the per-domain politeness lock unclaimed.
 */
public function test_handle_writes_blocked_robots_when_disallowed(): void
{
    Queue::fake();

    $robotsBody = "User-agent: *\nDisallow: /";
    Http::fake([
        'https://example.com/robots.txt' => Http::response($robotsBody, 200),
    ]);

    // The job must stop at the robots gate, so the fetch action is never invoked.
    $fetchAction = Mockery::mock(FetchPageAction::class);
    $fetchAction->shouldNotReceive('__invoke');
    $this->app->instance(FetchPageAction::class, $fetchAction);

    $blockedPage = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
    $pageCrawl = PageCrawl::factory()->page($blockedPage)->createQuietly();
    $lockKey = "crawler:domain:{$pageCrawl->domain}";

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    // The crawl row records the BlockedRobots outcome.
    $this->assertDatabaseHas('page_crawls', [
        'id' => $pageCrawl->id,
        'outcome' => CrawlOutcomeEnum::BlockedRobots->value,
    ]);

    // The page itself ends up Failed.
    $this->assertSame(PageStatusEnum::Failed, $blockedPage->fresh()->status);

    // The lock can still be acquired because the gate returned before claiming it.
    $lockAcquired = Cache::lock($lockKey, 10)->get();
    $this->assertTrue(
        $lockAcquired,
        'Expected the politeness lock to be free (gate returned before acquiring it), but it was already held.',
    );
}
|
||||
|
||||
public function test_handle_acquires_domain_lock_before_fetching(): void
|
||||
{
|
||||
Queue::fake();
|
||||
|
|
@ -367,6 +408,55 @@ public function test_handle_acquires_domain_lock_before_fetching(): void
|
|||
$this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome);
|
||||
}
|
||||
|
||||
/**
 * When robots.txt allows the page, the job fetches it exactly once, records
 * Success / Fetched, and leaves the per-domain politeness lock held.
 */
public function test_handle_proceeds_through_politeness_lock_when_robots_allow(): void
{
    Queue::fake();

    $robotsBody = "User-agent: *\nAllow: /";
    Http::fake([
        'https://example.com/robots.txt' => Http::response($robotsBody, 200),
    ]);

    // Robots gate passes, so the fetch action runs once and reports success.
    $successfulFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://example.com/article',
        title: 'Hello',
        extractedText: 'hi',
        outboundLinks: collect(),
        wordCount: 1,
        errorMessage: null,
    );
    $fetchAction = Mockery::mock(FetchPageAction::class);
    $fetchAction->shouldReceive('__invoke')->once()->andReturn($successfulFetch);
    $this->app->instance(FetchPageAction::class, $fetchAction);

    $allowedPage = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($allowedPage)->createQuietly();
    $lockKey = "crawler:domain:{$pageCrawl->domain}";

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    // The crawl row records Success rather than BlockedRobots.
    $this->assertDatabaseHas('page_crawls', [
        'id' => $pageCrawl->id,
        'outcome' => CrawlOutcomeEnum::Success->value,
    ]);

    // The page has advanced to Fetched.
    $this->assertSame(PageStatusEnum::Fetched, $allowedPage->fresh()->status);

    // The lock was claimed during the fetch and not released, so a second claim fails.
    $lockAcquired = Cache::lock($lockKey, 10)->get();
    $this->assertFalse(
        $lockAcquired,
        'Expected the politeness lock to be held after a successful fetch, but it was free.',
    );
}
|
||||
|
||||
private function mockFetchPageAction(
|
||||
CrawlOutcomeEnum $outcome,
|
||||
?int $statusCode = null,
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
namespace Tests\Unit\Services;
|
||||
|
||||
use App\Services\PolitenessService;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use Tests\TestCase;
|
||||
|
||||
class PolitenessServiceTest extends TestCase
|
||||
|
|
@ -20,4 +21,36 @@ public function test_min_delay_for_respects_config_override(): void
|
|||
|
||||
$this->assertSame(30, (new PolitenessService)->minDelayFor('example.com'));
|
||||
}
|
||||
|
||||
/**
 * A robots.txt Crawl-delay above the configured floor wins.
 */
public function test_min_delay_for_uses_robots_crawl_delay_when_higher(): void
{
    // The robots parser matches the user-agent token exactly (lowercased), so
    // the fixture must carry the same full string the service is configured with.
    $robotsBody = "User-agent: TroveBot/0.1 (+https://trove.lvl0.xyz/bot)\nCrawl-delay: 30";

    Http::fake([
        'https://example.com/robots.txt' => Http::response($robotsBody, 200),
    ]);

    config()->set('crawler.min_domain_delay_seconds', 10);
    config()->set('crawler.user_agent', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)');

    $politeness = app(PolitenessService::class);

    $this->assertSame(30, $politeness->minDelayFor('example.com'));
}
|
||||
|
||||
/**
 * The configured floor wins when it exceeds the robots.txt Crawl-delay.
 */
public function test_min_delay_for_uses_config_when_higher_than_robots(): void
{
    $robotsBody = "User-agent: TroveBot/0.1 (+https://trove.lvl0.xyz/bot)\nCrawl-delay: 10";

    Http::fake([
        'https://example.com/robots.txt' => Http::response($robotsBody, 200),
    ]);

    config()->set('crawler.min_domain_delay_seconds', 60);
    config()->set('crawler.user_agent', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)');

    $politeness = app(PolitenessService::class);

    $this->assertSame(60, $politeness->minDelayFor('example.com'));
}
|
||||
}
|
||||
|
|
|
|||
96
tests/Unit/Services/RobotsServiceTest.php
Normal file
96
tests/Unit/Services/RobotsServiceTest.php
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Tests\Unit\Services;
|
||||
|
||||
use App\Services\RobotsService;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use Tests\TestCase;
|
||||
|
||||
/**
 * Covers RobotsService::isAllowed() and RobotsService::crawlDelayFor()
 * against a faked https://example.com/robots.txt endpoint.
 */
class RobotsServiceTest extends TestCase
{
    public function test_is_allowed_returns_true_when_robots_txt_allows_path(): void
    {
        $this->fakeRobotsTxt("User-agent: *\nAllow: /");

        $allowed = app(RobotsService::class)->isAllowed('https://example.com/article', 'TroveBot/0.1');

        $this->assertTrue($allowed);
    }

    public function test_is_allowed_returns_false_when_robots_txt_disallows_path(): void
    {
        $this->fakeRobotsTxt("User-agent: *\nDisallow: /");

        $allowed = app(RobotsService::class)->isAllowed('https://example.com/article', 'TroveBot/0.1');

        $this->assertFalse($allowed);
    }

    public function test_is_allowed_returns_true_when_robots_txt_fetch_fails(): void
    {
        // A server error on robots.txt must not block crawling.
        $this->fakeRobotsTxt('', 500);

        $allowed = app(RobotsService::class)->isAllowed('https://example.com/article', 'TroveBot/0.1');

        $this->assertTrue($allowed);
    }

    public function test_is_allowed_caches_robots_txt_body_per_host(): void
    {
        $this->fakeRobotsTxt("User-agent: *\nAllow: /");

        $robots = app(RobotsService::class);

        // Two different paths on the same host should trigger a single fetch.
        foreach (['https://example.com/article', 'https://example.com/another-article'] as $pageUrl) {
            $robots->isAllowed($pageUrl, 'TroveBot/0.1');
        }

        Http::assertSentCount(1);
    }

    public function test_crawl_delay_for_returns_parsed_value(): void
    {
        $this->fakeRobotsTxt("User-agent: TroveBot/0.1\nCrawl-delay: 30");

        $delay = app(RobotsService::class)->crawlDelayFor('example.com', 'TroveBot/0.1');

        $this->assertSame(30, $delay);
    }

    public function test_crawl_delay_for_returns_null_when_absent(): void
    {
        $this->fakeRobotsTxt("User-agent: *\nDisallow: /private");

        $delay = app(RobotsService::class)->crawlDelayFor('example.com', 'TroveBot/0.1');

        $this->assertNull($delay);
    }

    /**
     * Fake the robots.txt endpoint of example.com with the given body and status.
     */
    private function fakeRobotsTxt(string $body, int $status = 200): void
    {
        Http::fake([
            'https://example.com/robots.txt' => Http::response($body, $status),
        ]);
    }
}
|
||||
Loading…
Reference in a new issue