9 - Add robots.txt handling with cache and politeness integration

2026-04-27 23:53:52 +02:00 · 2026-04-27 23:53:52 +02:00 · cda1414cd8
commit cda1414cd8
parent 264180cd36
9 changed files with 361 additions and 6 deletions
--- a/app/Jobs/ProcessCrawlJob.php
+++ b/app/Jobs/ProcessCrawlJob.php
@ -6,9 +6,11 @@
 use App\Actions\FetchPageAction;
 use App\Actions\RegisterDiscoveredPageAction;
 use App\Enums\CrawlOutcomeEnum;
 use App\Enums\PageStatusEnum;
 use App\Models\PageCrawl;
 use App\Services\PolitenessService;
 use App\Services\RobotsService;
 use App\ValueObjects\FetchResult;
 use Illuminate\Contracts\Queue\ShouldQueue;
 use Illuminate\Foundation\Queue\Queueable;
@ -24,6 +26,18 @@ public function __construct(
    public function handle(): void
    {
        $robotsService = resolve(RobotsService::class);
        if (! $robotsService->isAllowed($this->pageCrawl->page->url)) {
            $this->pageCrawl->update([
                'outcome' => CrawlOutcomeEnum::BlockedRobots,
                'completed_at' => now(),
            ]);
            $this->pageCrawl->page->update(['status' => PageStatusEnum::Failed]);
            return;
        }
        $fetcher = resolve(FetchPageAction::class);
        $register = resolve(RegisterDiscoveredPageAction::class);
        $politenessService = resolve(PolitenessService::class);
--- a/app/Services/PolitenessService.php
+++ b/app/Services/PolitenessService.php
@ -8,12 +8,12 @@ class PolitenessService
 {
    public function minDelayFor(string $domain): int
    {
-        $configValue = config('crawler.min_domain_delay_seconds');
+        /** @var RobotsService $robotsService */
        $robotsService = resolve(RobotsService::class);
        $crawlDelay = $robotsService->crawlDelayFor($domain, config('crawler.user_agent'));
-        if ($configValue !== null) {
+        $configValue = config('crawler.min_domain_delay_seconds', 10);
            return $configValue;
        }
-        return 10;
+        return max($crawlDelay ?? 0, $configValue);
    }
 }
--- a/app/Services/RobotsService.php
+++ b/app/Services/RobotsService.php
@ -0,0 +1,60 @@
 <?php
 declare(strict_types=1);
 namespace App\Services;
 use Illuminate\Http\Client\ConnectionException;
 use Illuminate\Support\Facades\Cache;
 use Illuminate\Support\Facades\Http;
 use Spatie\Robots\RobotsTxt;
 /**
  * Answers robots.txt questions (path permission and crawl delay) for a host.
  *
  * The robots.txt body is fetched over HTTPS once per host and kept in the
  * cache under "crawler:robots:{host}" for `crawler.robots_cache_ttl_seconds`.
  * Both public methods read through the same cache entry.
  */
 class RobotsService
 {
    public function __construct(
        private UrlService $urlService,
    ) {}
    /**
     * Tell whether robots.txt for the URL's host permits crawling the URL.
     *
     * @param  string       $url        Absolute URL of the page to be crawled.
     * @param  string|null  $userAgent  User agent whose rule group applies. When omitted, the
     *                                  configured `crawler.user_agent` is used so that this check
     *                                  and the crawl-delay lookup evaluate the same identity.
     * @return bool  True when crawling is permitted. A missing or unreachable robots.txt is
     *               represented by an empty body, which contains no rules to apply.
     */
    public function isAllowed(string $url, ?string $userAgent = null): bool
    {
        $host = $this->urlService->host($url);
        // parse_url() yields null when the component is absent and false when the URL is
        // malformed; neither may reach allows(), which requires a string under strict_types.
        $target = parse_url($url, PHP_URL_PATH);
        if (! is_string($target) || $target === '') {
            $target = '/';
        }
        // Keep the query string: robots rules may match on it (e.g. "Disallow: /search?q=").
        $query = parse_url($url, PHP_URL_QUERY);
        if (is_string($query) && $query !== '') {
            $target .= "?{$query}";
        }
        return (new RobotsTxt($this->robotsBody($host)))
            ->allows($target, $userAgent ?? config('crawler.user_agent'));
    }
    /**
     * Return the Crawl-delay that robots.txt declares for the given user agent.
     *
     * @param  string  $host       Host whose robots.txt is consulted.
     * @param  string  $userAgent  User agent whose rule group is looked up.
     * @return int|null  Whole seconds, or null when no usable Crawl-delay is declared.
     */
    public function crawlDelayFor(string $host, string $userAgent): ?int
    {
        $delay = (new RobotsTxt($this->robotsBody($host)))->crawlDelay($userAgent);
        // A non-numeric directive value carries no usable delay; report it as absent
        // instead of letting an int cast turn it into 0.
        if ($delay === null || ! is_numeric($delay)) {
            return null;
        }
        // Round fractional delays up so "0.5" is never truncated to zero seconds,
        // and clamp negatives, which have no meaning as a wait time.
        return max(0, (int) ceil((float) $delay));
    }
    /**
     * Fetch (or read from cache) the raw robots.txt body for a host.
     *
     * NOTE(review): a failed fetch is stored as an empty string for the whole TTL, so a
     * transient outage leaves the host without robots rules until the entry expires.
     * Confirm this is intended.
     *
     * @param  string  $host  Host name, used both in the cache key and in the request URL.
     * @return string  The response body on a successful response; an empty string on a
     *                 non-successful response or a connection failure.
     */
    private function robotsBody(string $host): string
    {
        return (string) Cache::remember(
            "crawler:robots:{$host}",
            config('crawler.robots_cache_ttl_seconds'),
            static function () use ($host): string {
                try {
                    $response = Http::get("https://{$host}/robots.txt");
                    return $response->successful() ? $response->body() : '';
                } catch (ConnectionException) {
                    return '';
                }
            },
        );
    }
 }
--- a/composer.json
+++ b/composer.json
@ -21,6 +21,7 @@
        "laravel/tinker": "^3.0",
        "livewire/livewire": "^4.2",
        "lvl0/fedi-discover": "@dev",
        "spatie/robots-txt": "^2.5",
        "symfony/dom-crawler": "^7.4"
    },
    "require-dev": {
--- a/composer.lock
+++ b/composer.lock
@ -4,7 +4,7 @@
        "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
        "This file is @generated automatically"
    ],
-    "content-hash": "2c63ed546b17b144997244f805e8a94a",
+    "content-hash": "707278fe3558199c1d07f11dba1d20ec",
    "packages": [
        {
            "name": "brick/math",
@ -3549,6 +3549,66 @@
            },
            "time": "2025-12-14T04:43:48+00:00"
        },
        {
            "name": "spatie/robots-txt",
            "version": "2.5.4",
            "source": {
                "type": "git",
                "url": "https://github.com/spatie/robots-txt.git",
                "reference": "a8dd35d0a94e863f52509a366a634978e9c1db03"
            },
            "dist": {
                "type": "zip",
                "url": "https://api.github.com/repos/spatie/robots-txt/zipball/a8dd35d0a94e863f52509a366a634978e9c1db03",
                "reference": "a8dd35d0a94e863f52509a366a634978e9c1db03",
                "shasum": ""
            },
            "require": {
                "php": "^8.1"
            },
            "require-dev": {
                "phpunit/phpunit": "^11.5.2"
            },
            "type": "library",
            "autoload": {
                "psr-4": {
                    "Spatie\\Robots\\": "src"
                }
            },
            "notification-url": "https://packagist.org/downloads/",
            "license": [
                "MIT"
            ],
            "authors": [
                {
                    "name": "Brent Roose",
                    "email": "brent@spatie.be",
                    "homepage": "https://spatie.be",
                    "role": "Developer"
                }
            ],
            "description": "Determine if a page may be crawled from robots.txt and robots meta tags",
            "homepage": "https://github.com/spatie/robots-txt",
            "keywords": [
                "robots-txt",
                "spatie"
            ],
            "support": {
                "issues": "https://github.com/spatie/robots-txt/issues",
                "source": "https://github.com/spatie/robots-txt/tree/2.5.4"
            },
            "funding": [
                {
                    "url": "https://spatie.be/open-source/support-us",
                    "type": "custom"
                },
                {
                    "url": "https://github.com/spatie",
                    "type": "github"
                }
            ],
            "time": "2026-02-25T07:59:20+00:00"
        },
        {
            "name": "symfony/clock",
            "version": "v7.4.8",
--- a/config/crawler.php
+++ b/config/crawler.php
@ -43,4 +43,5 @@
    'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'),
    'min_domain_delay_seconds' => env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10),
    'robots_cache_ttl_seconds' => env('CRAWLER_ROBOTS_CACHE_TTL_SECONDS', 60 * 60 * 24),
 ];
--- a/tests/Feature/Jobs/ProcessCrawlJobTest.php
+++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php
@ -15,6 +15,7 @@
 use Illuminate\Foundation\Testing\RefreshDatabase;
 use Illuminate\Support\Collection;
 use Illuminate\Support\Facades\Cache;
 use Illuminate\Support\Facades\Http;
 use Illuminate\Support\Facades\Queue;
 use Mockery;
 use Tests\TestCase;
@ -343,6 +344,46 @@ public function test_handle_does_not_release_lock_after_completion(): void
        $this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.');
    }
    /**
     * When robots.txt disallows the page, handle() must record BlockedRobots, mark the
     * page Failed, never invoke the fetch action and leave the domain lock unclaimed.
     */
    public function test_handle_writes_blocked_robots_when_disallowed(): void
    {
        Queue::fake();
        Http::fake([
            'https://example.com/robots.txt' => Http::response("User-agent: *\nDisallow: /", 200),
        ]);
        // Any invocation of the fetch action fails the test.
        $fetchAction = Mockery::mock(FetchPageAction::class);
        $fetchAction->shouldReceive('__invoke')->never();
        $this->app->instance(FetchPageAction::class, $fetchAction);
        $blockedPage = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
        $pageCrawl = PageCrawl::factory()->page($blockedPage)->createQuietly();
        // Built before handle() runs so the key does not depend on what the job does to the model.
        $lockKey = "crawler:domain:{$pageCrawl->domain}";
        $job = $this->app->make(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
        $job->handle();
        // The crawl row records the robots block.
        $this->assertDatabaseHas('page_crawls', [
            'id' => $pageCrawl->id,
            'outcome' => CrawlOutcomeEnum::BlockedRobots->value,
        ]);
        // The page itself ends up Failed.
        $reloadedPage = $blockedPage->fresh();
        $this->assertSame(PageStatusEnum::Failed, $reloadedPage->status);
        // Nobody claimed the politeness lock, so it can be acquired here.
        $lockAcquired = Cache::lock($lockKey, 10)->get();
        $this->assertTrue(
            $lockAcquired,
            'Expected the politeness lock to be free (gate returned before acquiring it), but it was already held.',
        );
    }
    public function test_handle_acquires_domain_lock_before_fetching(): void
    {
        Queue::fake();
@ -367,6 +408,55 @@ public function test_handle_acquires_domain_lock_before_fetching(): void
        $this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome);
    }
    /**
     * When robots.txt allows the page, handle() must fetch exactly once, record Success,
     * advance the page to Fetched and leave the domain lock held.
     */
    public function test_handle_proceeds_through_politeness_lock_when_robots_allow(): void
    {
        Queue::fake();
        Http::fake([
            'https://example.com/robots.txt' => Http::response("User-agent: *\nAllow: /", 200),
        ]);
        // Canned result handed back by the mocked fetch action.
        $successfulFetch = new FetchResult(
            outcome: CrawlOutcomeEnum::Success,
            statusCode: 200,
            finalUrl: 'https://example.com/article',
            title: 'Hello',
            extractedText: 'hi',
            outboundLinks: collect(),
            wordCount: 1,
            errorMessage: null,
        );
        // The robots gate passes, so the fetch action runs exactly one time.
        $fetchAction = Mockery::mock(FetchPageAction::class);
        $fetchAction->shouldReceive('__invoke')->once()->andReturn($successfulFetch);
        $this->app->instance(FetchPageAction::class, $fetchAction);
        $allowedPage = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $pageCrawl = PageCrawl::factory()->page($allowedPage)->createQuietly();
        // Built before handle() runs so the key does not depend on what the job does to the model.
        $lockKey = "crawler:domain:{$pageCrawl->domain}";
        $job = $this->app->make(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
        $job->handle();
        // The crawl row records Success rather than BlockedRobots.
        $this->assertDatabaseHas('page_crawls', [
            'id' => $pageCrawl->id,
            'outcome' => CrawlOutcomeEnum::Success->value,
        ]);
        // The page moved on to Fetched.
        $reloadedPage = $allowedPage->fresh();
        $this->assertSame(PageStatusEnum::Fetched, $reloadedPage->status);
        // The lock was claimed during the fetch and not released, so a second claim fails.
        $lockAcquired = Cache::lock($lockKey, 10)->get();
        $this->assertFalse(
            $lockAcquired,
            'Expected the politeness lock to be held after a successful fetch, but it was free.',
        );
    }
    private function mockFetchPageAction(
        CrawlOutcomeEnum $outcome,
        ?int $statusCode = null,
--- a/tests/Unit/Services/PolitenessServiceTest.php
+++ b/tests/Unit/Services/PolitenessServiceTest.php
@ -5,6 +5,7 @@
 namespace Tests\Unit\Services;
 use App\Services\PolitenessService;
 use Illuminate\Support\Facades\Http;
 use Tests\TestCase;
 class PolitenessServiceTest extends TestCase
@ -20,4 +21,36 @@ public function test_min_delay_for_respects_config_override(): void
        $this->assertSame(30, (new PolitenessService)->minDelayFor('example.com'));
    }
    /**
     * A robots.txt Crawl-delay of 30 beats a configured minimum of 10.
     */
    public function test_min_delay_for_uses_robots_crawl_delay_when_higher(): void
    {
        config([
            'crawler.min_domain_delay_seconds' => 10,
            'crawler.user_agent' => 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)',
        ]);
        // The fixture's User-agent line repeats the full configured string because the
        // parser matches the whole (lowercased) token the service passes to crawlDelayFor().
        $robotsTxt = Http::response(
            "User-agent: TroveBot/0.1 (+https://trove.lvl0.xyz/bot)\nCrawl-delay: 30",
            200,
        );
        Http::fake(['https://example.com/robots.txt' => $robotsTxt]);
        $delay = $this->app->make(PolitenessService::class)->minDelayFor('example.com');
        $this->assertSame(30, $delay);
    }
    /**
     * A configured minimum of 60 beats a robots.txt Crawl-delay of 10.
     */
    public function test_min_delay_for_uses_config_when_higher_than_robots(): void
    {
        config([
            'crawler.min_domain_delay_seconds' => 60,
            'crawler.user_agent' => 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)',
        ]);
        $robotsTxt = Http::response(
            "User-agent: TroveBot/0.1 (+https://trove.lvl0.xyz/bot)\nCrawl-delay: 10",
            200,
        );
        Http::fake(['https://example.com/robots.txt' => $robotsTxt]);
        $delay = $this->app->make(PolitenessService::class)->minDelayFor('example.com');
        $this->assertSame(60, $delay);
    }
 }
--- a/tests/Unit/Services/RobotsServiceTest.php
+++ b/tests/Unit/Services/RobotsServiceTest.php
@ -0,0 +1,96 @@
 <?php
 declare(strict_types=1);
 namespace Tests\Unit\Services;
 use App\Services\RobotsService;
 use Illuminate\Support\Facades\Http;
 use Tests\TestCase;
 /**
  * Exercises RobotsService against a faked https://example.com/robots.txt endpoint.
  */
 class RobotsServiceTest extends TestCase
 {
    public function test_is_allowed_returns_true_when_robots_txt_allows_path(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: *\nAllow: /");
        $allowed = $robots->isAllowed('https://example.com/article', 'TroveBot/0.1');
        $this->assertTrue($allowed);
    }
    public function test_is_allowed_returns_false_when_robots_txt_disallows_path(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: *\nDisallow: /");
        $allowed = $robots->isAllowed('https://example.com/article', 'TroveBot/0.1');
        $this->assertFalse($allowed);
    }
    public function test_is_allowed_returns_true_when_robots_txt_fetch_fails(): void
    {
        // A 500 from the robots endpoint is treated as "no rules".
        $robots = $this->serviceWithRobotsTxt('', 500);
        $allowed = $robots->isAllowed('https://example.com/article', 'TroveBot/0.1');
        $this->assertTrue($allowed);
    }
    public function test_is_allowed_caches_robots_txt_body_per_host(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: *\nAllow: /");
        // Two lookups on the same host must trigger a single HTTP request.
        foreach (['https://example.com/article', 'https://example.com/another-article'] as $pageUrl) {
            $robots->isAllowed($pageUrl, 'TroveBot/0.1');
        }
        Http::assertSentCount(1);
    }
    public function test_crawl_delay_for_returns_parsed_value(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: TroveBot/0.1\nCrawl-delay: 30");
        $delay = $robots->crawlDelayFor('example.com', 'TroveBot/0.1');
        $this->assertSame(30, $delay);
    }
    public function test_crawl_delay_for_returns_null_when_absent(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: *\nDisallow: /private");
        $delay = $robots->crawlDelayFor('example.com', 'TroveBot/0.1');
        $this->assertNull($delay);
    }
    /**
     * Fake the example.com robots.txt endpoint, then resolve the service under test.
     *
     * @param  string  $body    Response body served for robots.txt.
     * @param  int     $status  HTTP status served for robots.txt.
     */
    private function serviceWithRobotsTxt(string $body, int $status = 200): RobotsService
    {
        Http::fake([
            'https://example.com/robots.txt' => Http::response($body, $status),
        ]);
        return app(RobotsService::class);
    }
 }
 }