9 - Add robots.txt handling with cache and politeness integration
Some checks failed
CI / ci (push) Failing after 3h0m1s
CI / ci (pull_request) Has been cancelled

This commit is contained in:
myrmidex 2026-04-27 23:53:52 +02:00
parent 264180cd36
commit cda1414cd8
9 changed files with 361 additions and 6 deletions

View file

@ -6,9 +6,11 @@
use App\Actions\FetchPageAction; use App\Actions\FetchPageAction;
use App\Actions\RegisterDiscoveredPageAction; use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum; use App\Enums\PageStatusEnum;
use App\Models\PageCrawl; use App\Models\PageCrawl;
use App\Services\PolitenessService; use App\Services\PolitenessService;
use App\Services\RobotsService;
use App\ValueObjects\FetchResult; use App\ValueObjects\FetchResult;
use Illuminate\Contracts\Queue\ShouldQueue; use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Queue\Queueable; use Illuminate\Foundation\Queue\Queueable;
@ -24,6 +26,18 @@ public function __construct(
public function handle(): void public function handle(): void
{ {
$robotsService = resolve(RobotsService::class);
if (! $robotsService->isAllowed($this->pageCrawl->page->url)) {
$this->pageCrawl->update([
'outcome' => CrawlOutcomeEnum::BlockedRobots,
'completed_at' => now(),
]);
$this->pageCrawl->page->update(['status' => PageStatusEnum::Failed]);
return;
}
$fetcher = resolve(FetchPageAction::class); $fetcher = resolve(FetchPageAction::class);
$register = resolve(RegisterDiscoveredPageAction::class); $register = resolve(RegisterDiscoveredPageAction::class);
$politenessService = resolve(PolitenessService::class); $politenessService = resolve(PolitenessService::class);

View file

@ -8,12 +8,12 @@ class PolitenessService
{ {
public function minDelayFor(string $domain): int public function minDelayFor(string $domain): int
{ {
$configValue = config('crawler.min_domain_delay_seconds'); /** @var RobotsService $robotsService */
$robotsService = resolve(RobotsService::class);
$crawlDelay = $robotsService->crawlDelayFor($domain, config('crawler.user_agent'));
if ($configValue !== null) { $configValue = config('crawler.min_domain_delay_seconds', 10);
return $configValue;
}
return 10; return max($crawlDelay ?? 0, $configValue);
} }
} }

View file

@ -0,0 +1,60 @@
<?php
declare(strict_types=1);
namespace App\Services;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Http;
use Spatie\Robots\RobotsTxt;
/**
 * Answers robots.txt questions (is a URL allowed, what crawl delay applies) for a host.
 *
 * The raw robots.txt body is fetched from https://{host}/robots.txt and cached
 * under "crawler:robots:{host}" for `crawler.robots_cache_ttl_seconds`.
 * A non-successful response or a connection failure is cached as an empty body,
 * i.e. a robots.txt without any rules.
 */
class RobotsService
{
    public function __construct(
        private UrlService $urlService,
    ) {}

    /**
     * Decide whether robots.txt permits fetching the given URL.
     *
     * @param string      $url       Absolute URL of the page about to be crawled.
     * @param string|null $userAgent User agent to match rules against; when null the
     *                               configured `crawler.user_agent` is used so this check
     *                               evaluates the same agent as the crawl-delay lookup.
     */
    public function isAllowed(string $url, ?string $userAgent = null): bool
    {
        // The cast mirrors the string interpolation the cache key and fetch URL rely on.
        $host = (string) $this->urlService->host($url);

        // parse_url() returns false for a malformed URL and null when the component is
        // missing; neither may reach the parser, so both collapse to the root path.
        $path = parse_url($url, PHP_URL_PATH);
        $target = is_string($path) && $path !== '' ? $path : '/';

        // Keep the query string: robots.txt rules may target it (e.g. "Disallow: /*?sort=").
        $query = parse_url($url, PHP_URL_QUERY);
        if (is_string($query) && $query !== '') {
            $target .= "?{$query}";
        }

        $userAgent ??= $this->configuredUserAgent();

        return (new RobotsTxt($this->robotsBodyFor($host)))->allows($target, $userAgent);
    }

    /**
     * Return the Crawl-delay (whole seconds) robots.txt declares for the user agent.
     *
     * Fractional delays are rounded up so the crawler never waits less than requested.
     *
     * @return int|null Null when no numeric Crawl-delay applies to the user agent.
     */
    public function crawlDelayFor(string $host, string $userAgent): ?int
    {
        $delay = (new RobotsTxt($this->robotsBodyFor($host)))->crawlDelay($userAgent);

        if ($delay === null || ! is_numeric($delay)) {
            return null;
        }

        return max(0, (int) ceil((float) $delay));
    }

    /**
     * Fetch the robots.txt body for a host, going through the per-host cache.
     *
     * Failures (non-2xx status, connection error) yield an empty string, which is
     * cached like any other body.
     */
    private function robotsBodyFor(string $host): string
    {
        return (string) Cache::remember(
            "crawler:robots:{$host}",
            config('crawler.robots_cache_ttl_seconds'),
            static function () use ($host): string {
                try {
                    $response = Http::get("https://{$host}/robots.txt");

                    return $response->successful() ? $response->body() : '';
                } catch (ConnectionException) {
                    return '';
                }
            },
        );
    }

    /**
     * The crawler's configured user agent, or null when none is configured.
     */
    private function configuredUserAgent(): ?string
    {
        $userAgent = config('crawler.user_agent');

        return is_string($userAgent) && $userAgent !== '' ? $userAgent : null;
    }
}

View file

@ -21,6 +21,7 @@
"laravel/tinker": "^3.0", "laravel/tinker": "^3.0",
"livewire/livewire": "^4.2", "livewire/livewire": "^4.2",
"lvl0/fedi-discover": "@dev", "lvl0/fedi-discover": "@dev",
"spatie/robots-txt": "^2.5",
"symfony/dom-crawler": "^7.4" "symfony/dom-crawler": "^7.4"
}, },
"require-dev": { "require-dev": {

62
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically" "This file is @generated automatically"
], ],
"content-hash": "2c63ed546b17b144997244f805e8a94a", "content-hash": "707278fe3558199c1d07f11dba1d20ec",
"packages": [ "packages": [
{ {
"name": "brick/math", "name": "brick/math",
@ -3549,6 +3549,66 @@
}, },
"time": "2025-12-14T04:43:48+00:00" "time": "2025-12-14T04:43:48+00:00"
}, },
{
"name": "spatie/robots-txt",
"version": "2.5.4",
"source": {
"type": "git",
"url": "https://github.com/spatie/robots-txt.git",
"reference": "a8dd35d0a94e863f52509a366a634978e9c1db03"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/spatie/robots-txt/zipball/a8dd35d0a94e863f52509a366a634978e9c1db03",
"reference": "a8dd35d0a94e863f52509a366a634978e9c1db03",
"shasum": ""
},
"require": {
"php": "^8.1"
},
"require-dev": {
"phpunit/phpunit": "^11.5.2"
},
"type": "library",
"autoload": {
"psr-4": {
"Spatie\\Robots\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Brent Roose",
"email": "brent@spatie.be",
"homepage": "https://spatie.be",
"role": "Developer"
}
],
"description": "Determine if a page may be crawled from robots.txt and robots meta tags",
"homepage": "https://github.com/spatie/robots-txt",
"keywords": [
"robots-txt",
"spatie"
],
"support": {
"issues": "https://github.com/spatie/robots-txt/issues",
"source": "https://github.com/spatie/robots-txt/tree/2.5.4"
},
"funding": [
{
"url": "https://spatie.be/open-source/support-us",
"type": "custom"
},
{
"url": "https://github.com/spatie",
"type": "github"
}
],
"time": "2026-02-25T07:59:20+00:00"
},
{ {
"name": "symfony/clock", "name": "symfony/clock",
"version": "v7.4.8", "version": "v7.4.8",

View file

@ -43,4 +43,5 @@
'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'), 'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'),
'min_domain_delay_seconds' => env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10), 'min_domain_delay_seconds' => env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10),
'robots_cache_ttl_seconds' => env('CRAWLER_ROBOTS_CACHE_TTL_SECONDS', 60 * 60 * 24),
]; ];

View file

@ -15,6 +15,7 @@
use Illuminate\Foundation\Testing\RefreshDatabase; use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Collection; use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Cache; use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Facades\Queue; use Illuminate\Support\Facades\Queue;
use Mockery; use Mockery;
use Tests\TestCase; use Tests\TestCase;
@ -343,6 +344,46 @@ public function test_handle_does_not_release_lock_after_completion(): void
$this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.'); $this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.');
} }
/**
 * When robots.txt disallows the page, handle() records BlockedRobots, marks the
 * page Failed, never invokes the fetcher and leaves the domain lock unclaimed.
 */
public function test_handle_writes_blocked_robots_when_disallowed(): void
{
    Queue::fake();

    $disallowAll = Http::response(
        "User-agent: *\nDisallow: /",
        200,
    );
    Http::fake(['https://example.com/robots.txt' => $disallowAll]);

    // The robots gate returns before the lock, so the fetch action must stay untouched.
    $fetchAction = Mockery::mock(FetchPageAction::class);
    $fetchAction->shouldNotReceive('__invoke');
    $this->app->instance(FetchPageAction::class, $fetchAction);

    $blockedPage = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
    $pageCrawl = PageCrawl::factory()->page($blockedPage)->createQuietly();
    $domain = $pageCrawl->domain;

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    // The crawl row carries the BlockedRobots outcome.
    $this->assertDatabaseHas('page_crawls', [
        'id' => $pageCrawl->id,
        'outcome' => CrawlOutcomeEnum::BlockedRobots->value,
    ]);

    // The page itself ends up Failed.
    $reloadedPage = $blockedPage->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloadedPage->status);

    // Nobody claimed the politeness lock, so it can still be acquired here.
    $lockAcquired = Cache::lock("crawler:domain:{$domain}", 10)->get();
    $this->assertTrue(
        $lockAcquired,
        'Expected the politeness lock to be free (gate returned before acquiring it), but it was already held.',
    );
}
public function test_handle_acquires_domain_lock_before_fetching(): void public function test_handle_acquires_domain_lock_before_fetching(): void
{ {
Queue::fake(); Queue::fake();
@ -367,6 +408,55 @@ public function test_handle_acquires_domain_lock_before_fetching(): void
$this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome); $this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome);
} }
/**
 * When robots.txt allows the page, handle() fetches it exactly once, stores a
 * Success outcome, moves the page to Fetched and keeps the domain lock held.
 */
public function test_handle_proceeds_through_politeness_lock_when_robots_allow(): void
{
    Queue::fake();

    $allowAll = Http::response(
        "User-agent: *\nAllow: /",
        200,
    );
    Http::fake(['https://example.com/robots.txt' => $allowAll]);

    // Robots gate passes, so the fetch action runs once and reports a successful fetch.
    $successfulFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://example.com/article',
        title: 'Hello',
        extractedText: 'hi',
        outboundLinks: collect(),
        wordCount: 1,
        errorMessage: null,
    );
    $fetchAction = Mockery::mock(FetchPageAction::class);
    $fetchAction->shouldReceive('__invoke')->once()->andReturn($successfulFetch);
    $this->app->instance(FetchPageAction::class, $fetchAction);

    $articlePage = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($articlePage)->createQuietly();
    $domain = $pageCrawl->domain;

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    // Success, not BlockedRobots, is what gets persisted.
    $this->assertDatabaseHas('page_crawls', [
        'id' => $pageCrawl->id,
        'outcome' => CrawlOutcomeEnum::Success->value,
    ]);

    // The page advanced to Fetched.
    $reloadedPage = $articlePage->fresh();
    $this->assertSame(PageStatusEnum::Fetched, $reloadedPage->status);

    // The lock claimed during the fetch is still held, so a second claim fails.
    $lockAcquired = Cache::lock("crawler:domain:{$domain}", 10)->get();
    $this->assertFalse(
        $lockAcquired,
        'Expected the politeness lock to be held after a successful fetch, but it was free.',
    );
}
private function mockFetchPageAction( private function mockFetchPageAction(
CrawlOutcomeEnum $outcome, CrawlOutcomeEnum $outcome,
?int $statusCode = null, ?int $statusCode = null,

View file

@ -5,6 +5,7 @@
namespace Tests\Unit\Services; namespace Tests\Unit\Services;
use App\Services\PolitenessService; use App\Services\PolitenessService;
use Illuminate\Support\Facades\Http;
use Tests\TestCase; use Tests\TestCase;
class PolitenessServiceTest extends TestCase class PolitenessServiceTest extends TestCase
@ -20,4 +21,36 @@ public function test_min_delay_for_respects_config_override(): void
$this->assertSame(30, (new PolitenessService)->minDelayFor('example.com')); $this->assertSame(30, (new PolitenessService)->minDelayFor('example.com'));
} }
/**
 * A robots.txt Crawl-delay larger than the configured minimum wins.
 */
public function test_min_delay_for_uses_robots_crawl_delay_when_higher(): void
{
    config()->set('crawler.min_domain_delay_seconds', 10);
    config()->set('crawler.user_agent', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)');

    // Spatie matches the lowercased user-agent token exactly, so the fixture has to
    // carry the full string the service hands to crawlDelayFor().
    $robotsTxt = "User-agent: TroveBot/0.1 (+https://trove.lvl0.xyz/bot)\nCrawl-delay: 30";
    Http::fake([
        'https://example.com/robots.txt' => Http::response($robotsTxt, 200),
    ]);

    $politeness = app(PolitenessService::class);

    $this->assertSame(30, $politeness->minDelayFor('example.com'));
}
/**
 * A configured minimum larger than the robots.txt Crawl-delay wins.
 */
public function test_min_delay_for_uses_config_when_higher_than_robots(): void
{
    config()->set('crawler.min_domain_delay_seconds', 60);
    config()->set('crawler.user_agent', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)');

    $robotsTxt = "User-agent: TroveBot/0.1 (+https://trove.lvl0.xyz/bot)\nCrawl-delay: 10";
    Http::fake([
        'https://example.com/robots.txt' => Http::response($robotsTxt, 200),
    ]);

    $politeness = app(PolitenessService::class);

    $this->assertSame(60, $politeness->minDelayFor('example.com'));
}
} }

View file

@ -0,0 +1,96 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Services;
use App\Services\RobotsService;
use Illuminate\Support\Facades\Http;
use Tests\TestCase;
/**
 * Exercises RobotsService against faked robots.txt responses for example.com.
 */
class RobotsServiceTest extends TestCase
{
    public function test_is_allowed_returns_true_when_robots_txt_allows_path(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: *\nAllow: /");

        $allowed = $robots->isAllowed('https://example.com/article', 'TroveBot/0.1');

        $this->assertTrue($allowed);
    }

    public function test_is_allowed_returns_false_when_robots_txt_disallows_path(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: *\nDisallow: /");

        $allowed = $robots->isAllowed('https://example.com/article', 'TroveBot/0.1');

        $this->assertFalse($allowed);
    }

    public function test_is_allowed_returns_true_when_robots_txt_fetch_fails(): void
    {
        // A 500 response stands in for a failed robots.txt fetch.
        $robots = $this->serviceWithRobotsTxt('', 500);

        $allowed = $robots->isAllowed('https://example.com/article', 'TroveBot/0.1');

        $this->assertTrue($allowed);
    }

    public function test_is_allowed_caches_robots_txt_body_per_host(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: *\nAllow: /");

        // Two lookups on the same host must trigger a single HTTP request.
        foreach (['https://example.com/article', 'https://example.com/another-article'] as $pageUrl) {
            $robots->isAllowed($pageUrl, 'TroveBot/0.1');
        }

        Http::assertSentCount(1);
    }

    public function test_crawl_delay_for_returns_parsed_value(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: TroveBot/0.1\nCrawl-delay: 30");

        $delay = $robots->crawlDelayFor('example.com', 'TroveBot/0.1');

        $this->assertSame(30, $delay);
    }

    public function test_crawl_delay_for_returns_null_when_absent(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: *\nDisallow: /private");

        $delay = $robots->crawlDelayFor('example.com', 'TroveBot/0.1');

        $this->assertNull($delay);
    }

    /**
     * Fake https://example.com/robots.txt with the given body and status, then
     * resolve the service from the container.
     */
    private function serviceWithRobotsTxt(string $body, int $status = 200): RobotsService
    {
        Http::fake([
            'https://example.com/robots.txt' => Http::response($body, $status),
        ]);

        return app(RobotsService::class);
    }
}