9 - Add robots.txt handling with cache and politeness integration

2026-04-27 23:53:52 +02:00 · 2026-04-27 23:53:52 +02:00 · cda1414cd8
commit cda1414cd8
parent 264180cd36
9 changed files with 361 additions and 6 deletions
--- a/app/Jobs/ProcessCrawlJob.php
+++ b/app/Jobs/ProcessCrawlJob.php
@ -6,9 +6,11 @@

 use App\Actions\FetchPageAction;
 use App\Actions\RegisterDiscoveredPageAction;
+use App\Enums\CrawlOutcomeEnum;
 use App\Enums\PageStatusEnum;
 use App\Models\PageCrawl;
 use App\Services\PolitenessService;
+use App\Services\RobotsService;
 use App\ValueObjects\FetchResult;
 use Illuminate\Contracts\Queue\ShouldQueue;
 use Illuminate\Foundation\Queue\Queueable;
@ -24,6 +26,18 @@ public function __construct(

    public function handle(): void
    {
+        $robotsService = resolve(RobotsService::class);
+
+        if (! $robotsService->isAllowed($this->pageCrawl->page->url)) {
+            $this->pageCrawl->update([
+                'outcome' => CrawlOutcomeEnum::BlockedRobots,
+                'completed_at' => now(),
+            ]);
+            $this->pageCrawl->page->update(['status' => PageStatusEnum::Failed]);
+
+            return;
+        }
+
        $fetcher = resolve(FetchPageAction::class);
        $register = resolve(RegisterDiscoveredPageAction::class);
        $politenessService = resolve(PolitenessService::class);
--- a/app/Services/PolitenessService.php
+++ b/app/Services/PolitenessService.php
@ -8,12 +8,12 @@ class PolitenessService
 {
    public function minDelayFor(string $domain): int
    {
-        $configValue = config('crawler.min_domain_delay_seconds');
+        /** @var RobotsService $robotsService */
+        $robotsService = resolve(RobotsService::class);
+        $crawlDelay = $robotsService->crawlDelayFor($domain, config('crawler.user_agent'));

-        if ($configValue !== null) {
-            return $configValue;
-        }
+        // The default argument of config() is not used when the key exists with a
+        // null value, so keep the previous "null means 10 seconds" fallback
+        // explicitly. The cast covers env()-sourced values, which arrive as strings
+        // and would otherwise leak out of max() through the int return type.
+        $configValue = (int) (config('crawler.min_domain_delay_seconds') ?? 10);

-        return 10;
+        // Whichever is stricter wins: the site's Crawl-delay or our configured floor.
+        return max($crawlDelay ?? 0, $configValue);
    }
 }
--- a/app/Services/RobotsService.php
+++ b/app/Services/RobotsService.php
@ -0,0 +1,60 @@
+<?php
+
+declare(strict_types=1);
+
+namespace App\Services;
+
+use Illuminate\Http\Client\ConnectionException;
+use Illuminate\Support\Facades\Cache;
+use Illuminate\Support\Facades\Http;
+use Spatie\Robots\RobotsTxt;
+
+class RobotsService
+{
+    public function __construct(
+        private UrlService $urlService,
+    ) {}
+
+    /**
+     * Tell whether the host's robots.txt permits crawling the given URL.
+     *
+     * The path and the query string are both matched against the rules, so
+     * directives such as "Disallow: /search?q=" take effect. When no user
+     * agent is passed, the configured crawler.user_agent is used, which is the
+     * same identity callers hand to crawlDelayFor().
+     *
+     * @param  string  $url  Absolute URL of the page about to be crawled.
+     * @param  string|null  $userAgent  Agent to evaluate the rules for; null selects the configured one.
+     */
+    public function isAllowed(string $url, ?string $userAgent = null): bool
+    {
+        $host = $this->urlService->host($url);
+
+        if ($userAgent === null) {
+            $configured = config('crawler.user_agent');
+            $userAgent = is_string($configured) ? $configured : null;
+        }
+
+        return (new RobotsTxt($this->robotsBodyFor($host)))
+            ->allows($this->requestTargetOf($url), $userAgent);
+    }
+
+    /**
+     * Read the Crawl-delay the host's robots.txt declares for a user agent.
+     *
+     * Fractional values are rounded up so the crawler never waits less than
+     * the site asked for.
+     *
+     * @param  string  $host  Host name whose robots.txt is consulted.
+     * @param  string  $userAgent  Agent whose Crawl-delay is looked up.
+     * @return int|null Delay in whole seconds, or null when none is declared or it is not numeric.
+     */
+    public function crawlDelayFor(string $host, string $userAgent): ?int
+    {
+        $delay = (new RobotsTxt($this->robotsBodyFor($host)))->crawlDelay($userAgent);
+
+        if ($delay === null || ! is_numeric($delay)) {
+            return null;
+        }
+
+        return (int) ceil((float) $delay);
+    }
+
+    /**
+     * Return the robots.txt body for a host, cached under "crawler:robots:{host}"
+     * for crawler.robots_cache_ttl_seconds.
+     *
+     * The file is always requested over HTTPS. A non-successful response or a
+     * connection failure is stored as an empty body, i.e. no rules at all.
+     */
+    private function robotsBodyFor(string $host): string
+    {
+        return Cache::remember(
+            "crawler:robots:{$host}",
+            config('crawler.robots_cache_ttl_seconds'),
+            static function () use ($host): string {
+                try {
+                    $response = Http::get("https://{$host}/robots.txt");
+
+                    return $response->successful() ? $response->body() : '';
+                } catch (ConnectionException) {
+                    return '';
+                }
+            }
+        );
+    }
+
+    /**
+     * Reduce a URL to the "path?query" form that robots.txt rules are matched against.
+     *
+     * Falls back to "/" when the URL has no path or cannot be parsed at all
+     * (parse_url() returns false in that case, which "??" would not catch).
+     */
+    private function requestTargetOf(string $url): string
+    {
+        $parts = parse_url($url);
+
+        if (! is_array($parts)) {
+            return '/';
+        }
+
+        $target = $parts['path'] ?? '/';
+
+        if (isset($parts['query'])) {
+            $target .= '?'.$parts['query'];
+        }
+
+        return $target;
+    }
+}
--- a/composer.json
+++ b/composer.json
@ -21,6 +21,7 @@
        "laravel/tinker": "^3.0",
        "livewire/livewire": "^4.2",
        "lvl0/fedi-discover": "@dev",
+        "spatie/robots-txt": "^2.5",
        "symfony/dom-crawler": "^7.4"
    },
    "require-dev": {
--- a/composer.lock
+++ b/composer.lock
@ -4,7 +4,7 @@
        "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
        "This file is @generated automatically"
    ],
-    "content-hash": "2c63ed546b17b144997244f805e8a94a",
+    "content-hash": "707278fe3558199c1d07f11dba1d20ec",
    "packages": [
        {
            "name": "brick/math",
@ -3549,6 +3549,66 @@
            },
            "time": "2025-12-14T04:43:48+00:00"
        },
+        {
+            "name": "spatie/robots-txt",
+            "version": "2.5.4",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/spatie/robots-txt.git",
+                "reference": "a8dd35d0a94e863f52509a366a634978e9c1db03"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/spatie/robots-txt/zipball/a8dd35d0a94e863f52509a366a634978e9c1db03",
+                "reference": "a8dd35d0a94e863f52509a366a634978e9c1db03",
+                "shasum": ""
+            },
+            "require": {
+                "php": "^8.1"
+            },
+            "require-dev": {
+                "phpunit/phpunit": "^11.5.2"
+            },
+            "type": "library",
+            "autoload": {
+                "psr-4": {
+                    "Spatie\\Robots\\": "src"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "MIT"
+            ],
+            "authors": [
+                {
+                    "name": "Brent Roose",
+                    "email": "brent@spatie.be",
+                    "homepage": "https://spatie.be",
+                    "role": "Developer"
+                }
+            ],
+            "description": "Determine if a page may be crawled from robots.txt and robots meta tags",
+            "homepage": "https://github.com/spatie/robots-txt",
+            "keywords": [
+                "robots-txt",
+                "spatie"
+            ],
+            "support": {
+                "issues": "https://github.com/spatie/robots-txt/issues",
+                "source": "https://github.com/spatie/robots-txt/tree/2.5.4"
+            },
+            "funding": [
+                {
+                    "url": "https://spatie.be/open-source/support-us",
+                    "type": "custom"
+                },
+                {
+                    "url": "https://github.com/spatie",
+                    "type": "github"
+                }
+            ],
+            "time": "2026-02-25T07:59:20+00:00"
+        },
        {
            "name": "symfony/clock",
            "version": "v7.4.8",
--- a/config/crawler.php
+++ b/config/crawler.php
@ -43,4 +43,5 @@
    'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'),

    'min_domain_delay_seconds' => env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10),
+    'robots_cache_ttl_seconds' => env('CRAWLER_ROBOTS_CACHE_TTL_SECONDS', 60 * 60 * 24),
 ];
--- a/tests/Feature/Jobs/ProcessCrawlJobTest.php
+++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php
@ -15,6 +15,7 @@
 use Illuminate\Foundation\Testing\RefreshDatabase;
 use Illuminate\Support\Collection;
 use Illuminate\Support\Facades\Cache;
+use Illuminate\Support\Facades\Http;
 use Illuminate\Support\Facades\Queue;
 use Mockery;
 use Tests\TestCase;
@ -343,6 +344,46 @@ public function test_handle_does_not_release_lock_after_completion(): void
        $this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.');
    }

+    public function test_handle_writes_blocked_robots_when_disallowed(): void
+    {
+        Queue::fake();
+
+        // robots.txt for the host forbids every path for every agent.
+        $robotsTxt = Http::response("User-agent: *\nDisallow: /", 200);
+        Http::fake(['https://example.com/robots.txt' => $robotsTxt]);
+
+        // A blocked URL must short-circuit the job, so the fetcher stays untouched.
+        $fetchAction = Mockery::mock(FetchPageAction::class);
+        $fetchAction->shouldNotReceive('__invoke');
+        $this->app->instance(FetchPageAction::class, $fetchAction);
+
+        $blockedPage = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
+        $pageCrawl = PageCrawl::factory()->page($blockedPage)->createQuietly();
+        $lockKey = "crawler:domain:{$pageCrawl->domain}";
+
+        $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
+        $job->handle();
+
+        // The crawl row records the robots block ...
+        $this->assertDatabaseHas('page_crawls', [
+            'id' => $pageCrawl->id,
+            'outcome' => CrawlOutcomeEnum::BlockedRobots->value,
+        ]);
+
+        // ... and the page itself ends up Failed.
+        $this->assertSame(PageStatusEnum::Failed, $blockedPage->fresh()->status);
+
+        // Nothing claimed the politeness lock, so this test can still take it.
+        $lockWasFree = Cache::lock($lockKey, 10)->get();
+        $this->assertTrue(
+            $lockWasFree,
+            'Expected the politeness lock to be free (gate returned before acquiring it), but it was already held.',
+        );
+    }
+
    public function test_handle_acquires_domain_lock_before_fetching(): void
    {
        Queue::fake();
@ -367,6 +408,55 @@ public function test_handle_acquires_domain_lock_before_fetching(): void
        $this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome);
    }

+    public function test_handle_proceeds_through_politeness_lock_when_robots_allow(): void
+    {
+        Queue::fake();
+
+        // robots.txt explicitly permits everything for every agent.
+        $robotsTxt = Http::response("User-agent: *\nAllow: /", 200);
+        Http::fake(['https://example.com/robots.txt' => $robotsTxt]);
+
+        // With the gate open, the job has to invoke the fetcher exactly one time.
+        $fetchResult = new FetchResult(
+            outcome: CrawlOutcomeEnum::Success,
+            statusCode: 200,
+            finalUrl: 'https://example.com/article',
+            title: 'Hello',
+            extractedText: 'hi',
+            outboundLinks: collect(),
+            wordCount: 1,
+            errorMessage: null,
+        );
+        $fetchAction = Mockery::mock(FetchPageAction::class);
+        $fetchAction->shouldReceive('__invoke')->once()->andReturn($fetchResult);
+        $this->app->instance(FetchPageAction::class, $fetchAction);
+
+        $allowedPage = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
+        $pageCrawl = PageCrawl::factory()->page($allowedPage)->createQuietly();
+        $lockKey = "crawler:domain:{$pageCrawl->domain}";
+
+        $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
+        $job->handle();
+
+        // The crawl finished as Success rather than BlockedRobots ...
+        $this->assertDatabaseHas('page_crawls', [
+            'id' => $pageCrawl->id,
+            'outcome' => CrawlOutcomeEnum::Success->value,
+        ]);
+
+        // ... and the page moved on to Fetched.
+        $this->assertSame(PageStatusEnum::Fetched, $allowedPage->fresh()->status);
+
+        // The job took the politeness lock and left it held, so it cannot be re-acquired here.
+        $lockWasFree = Cache::lock($lockKey, 10)->get();
+        $this->assertFalse(
+            $lockWasFree,
+            'Expected the politeness lock to be held after a successful fetch, but it was free.',
+        );
+    }
+
    private function mockFetchPageAction(
        CrawlOutcomeEnum $outcome,
        ?int $statusCode = null,
--- a/tests/Unit/Services/PolitenessServiceTest.php
+++ b/tests/Unit/Services/PolitenessServiceTest.php
@ -5,6 +5,7 @@
 namespace Tests\Unit\Services;

 use App\Services\PolitenessService;
+use Illuminate\Support\Facades\Http;
 use Tests\TestCase;

 class PolitenessServiceTest extends TestCase
@ -20,4 +21,36 @@ public function test_min_delay_for_respects_config_override(): void

        $this->assertSame(30, (new PolitenessService)->minDelayFor('example.com'));
    }
+
+    public function test_min_delay_for_uses_robots_crawl_delay_when_higher(): void
+    {
+        $userAgent = 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)';
+
+        // The robots.txt group names the complete UA string: Spatie lowercases and
+        // compares the whole token, so it has to equal what crawlDelayFor() receives.
+        $robotsTxt = Http::response("User-agent: {$userAgent}\nCrawl-delay: 30", 200);
+        Http::fake(['https://example.com/robots.txt' => $robotsTxt]);
+
+        config([
+            'crawler.min_domain_delay_seconds' => 10,
+            'crawler.user_agent' => $userAgent,
+        ]);
+
+        $delay = app(PolitenessService::class)->minDelayFor('example.com');
+
+        $this->assertSame(30, $delay);
+    }
+
+    public function test_min_delay_for_uses_config_when_higher_than_robots(): void
+    {
+        $userAgent = 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)';
+
+        // robots.txt asks for 10 seconds, which is below the configured floor of 60.
+        $robotsTxt = Http::response("User-agent: {$userAgent}\nCrawl-delay: 10", 200);
+        Http::fake(['https://example.com/robots.txt' => $robotsTxt]);
+
+        config([
+            'crawler.min_domain_delay_seconds' => 60,
+            'crawler.user_agent' => $userAgent,
+        ]);
+
+        $delay = app(PolitenessService::class)->minDelayFor('example.com');
+
+        $this->assertSame(60, $delay);
+    }
 }
--- a/tests/Unit/Services/RobotsServiceTest.php
+++ b/tests/Unit/Services/RobotsServiceTest.php
@ -0,0 +1,96 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Tests\Unit\Services;
+
+use App\Services\RobotsService;
+use Illuminate\Support\Facades\Http;
+use Tests\TestCase;
+
+class RobotsServiceTest extends TestCase
+{
+    private const ROBOTS_URL = 'https://example.com/robots.txt';
+
+    private const USER_AGENT = 'TroveBot/0.1';
+
+    public function test_is_allowed_returns_true_when_robots_txt_allows_path(): void
+    {
+        $robots = $this->serviceWithRobotsTxt("User-agent: *\nAllow: /");
+
+        $allowed = $robots->isAllowed('https://example.com/article', self::USER_AGENT);
+
+        $this->assertTrue($allowed);
+    }
+
+    public function test_is_allowed_returns_false_when_robots_txt_disallows_path(): void
+    {
+        $robots = $this->serviceWithRobotsTxt("User-agent: *\nDisallow: /");
+
+        $allowed = $robots->isAllowed('https://example.com/article', self::USER_AGENT);
+
+        $this->assertFalse($allowed);
+    }
+
+    public function test_is_allowed_returns_true_when_robots_txt_fetch_fails(): void
+    {
+        // A 500 from the robots.txt endpoint must not block crawling.
+        $robots = $this->serviceWithRobotsTxt('', 500);
+
+        $allowed = $robots->isAllowed('https://example.com/article', self::USER_AGENT);
+
+        $this->assertTrue($allowed);
+    }
+
+    public function test_is_allowed_caches_robots_txt_body_per_host(): void
+    {
+        $robots = $this->serviceWithRobotsTxt("User-agent: *\nAllow: /");
+
+        // Two different pages on one host should trigger a single robots.txt request.
+        foreach (['https://example.com/article', 'https://example.com/another-article'] as $pageUrl) {
+            $robots->isAllowed($pageUrl, self::USER_AGENT);
+        }
+
+        Http::assertSentCount(1);
+    }
+
+    public function test_crawl_delay_for_returns_parsed_value(): void
+    {
+        $robots = $this->serviceWithRobotsTxt('User-agent: '.self::USER_AGENT."\nCrawl-delay: 30");
+
+        $delay = $robots->crawlDelayFor('example.com', self::USER_AGENT);
+
+        $this->assertSame(30, $delay);
+    }
+
+    public function test_crawl_delay_for_returns_null_when_absent(): void
+    {
+        $robots = $this->serviceWithRobotsTxt("User-agent: *\nDisallow: /private");
+
+        $delay = $robots->crawlDelayFor('example.com', self::USER_AGENT);
+
+        $this->assertNull($delay);
+    }
+
+    /**
+     * Fake the example.com robots.txt endpoint with the given body and status,
+     * then resolve the service under test from the container.
+     */
+    private function serviceWithRobotsTxt(string $body, int $status = 200): RobotsService
+    {
+        Http::fake([self::ROBOTS_URL => Http::response($body, $status)]);
+
+        return app(RobotsService::class);
+    }
+}