diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php index 7b30a3f..ff700e2 100644 --- a/app/Jobs/ProcessCrawlJob.php +++ b/app/Jobs/ProcessCrawlJob.php @@ -6,9 +6,11 @@ use App\Actions\FetchPageAction; use App\Actions\RegisterDiscoveredPageAction; +use App\Enums\CrawlOutcomeEnum; use App\Enums\PageStatusEnum; use App\Models\PageCrawl; use App\Services\PolitenessService; +use App\Services\RobotsService; use App\ValueObjects\FetchResult; use Illuminate\Contracts\Queue\ShouldQueue; use Illuminate\Foundation\Queue\Queueable; @@ -24,6 +26,18 @@ public function __construct( public function handle(): void { + $robotsService = resolve(RobotsService::class); + + if (! $robotsService->isAllowed($this->pageCrawl->page->url)) { + $this->pageCrawl->update([ + 'outcome' => CrawlOutcomeEnum::BlockedRobots, + 'completed_at' => now(), + ]); + $this->pageCrawl->page->update(['status' => PageStatusEnum::Failed]); + + return; + } + $fetcher = resolve(FetchPageAction::class); $register = resolve(RegisterDiscoveredPageAction::class); $politenessService = resolve(PolitenessService::class); diff --git a/app/Services/PolitenessService.php b/app/Services/PolitenessService.php index 5114458..4d2b12b 100644 --- a/app/Services/PolitenessService.php +++ b/app/Services/PolitenessService.php @@ -8,12 +8,12 @@ class PolitenessService { public function minDelayFor(string $domain): int { - $configValue = config('crawler.min_domain_delay_seconds'); + /** @var RobotsService $robotsService */ + $robotsService = resolve(RobotsService::class); + $crawlDelay = $robotsService->crawlDelayFor($domain, config('crawler.user_agent')); - if ($configValue !== null) { - return $configValue; - } + $configValue = config('crawler.min_domain_delay_seconds', 10); - return 10; + return max($crawlDelay ?? 0, $configValue); } } diff --git a/app/Services/RobotsService.php b/app/Services/RobotsService.php new file mode 100644 index 0000000..f8b7f65 --- /dev/null +++ b/app/Services/RobotsService.php @@ -0,0 +1,60 @@ +urlService->host($url); + $path = parse_url($url, PHP_URL_PATH) ?? '/'; + + $body = Cache::remember( + "crawler:robots:{$host}", + config('crawler.robots_cache_ttl_seconds'), + function () use ($host) { + try { + $response = Http::get("https://{$host}/robots.txt"); + + return $response->successful() ? $response->body() : ''; + } catch (ConnectionException) { + return ''; + } + } + ); + + return (new RobotsTxt($body))->allows($path, $userAgent); + } + + public function crawlDelayFor(string $host, string $userAgent): ?int + { + $body = Cache::remember( + "crawler:robots:{$host}", + config('crawler.robots_cache_ttl_seconds'), + function () use ($host) { + try { + $response = Http::get("https://{$host}/robots.txt"); + + return $response->successful() ? $response->body() : ''; + } catch (ConnectionException) { + return ''; + } + } + ); + + $delay = (new RobotsTxt($body))->crawlDelay($userAgent); + + return $delay !== null ? (int) $delay : null; + } +} diff --git a/composer.json b/composer.json index de1ad17..6ba251e 100644 --- a/composer.json +++ b/composer.json @@ -21,6 +21,7 @@ "laravel/tinker": "^3.0", "livewire/livewire": "^4.2", "lvl0/fedi-discover": "@dev", + "spatie/robots-txt": "^2.5", "symfony/dom-crawler": "^7.4" }, "require-dev": { diff --git a/composer.lock b/composer.lock index e1fe116..d2b609b 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "2c63ed546b17b144997244f805e8a94a", + "content-hash": "707278fe3558199c1d07f11dba1d20ec", "packages": [ { "name": "brick/math", @@ -3549,6 +3549,66 @@ }, "time": "2025-12-14T04:43:48+00:00" }, + { + "name": "spatie/robots-txt", + "version": "2.5.4", + "source": { + "type": "git", + "url": "https://github.com/spatie/robots-txt.git", + "reference": "a8dd35d0a94e863f52509a366a634978e9c1db03" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/spatie/robots-txt/zipball/a8dd35d0a94e863f52509a366a634978e9c1db03", + "reference": "a8dd35d0a94e863f52509a366a634978e9c1db03", + "shasum": "" + }, + "require": { + "php": "^8.1" + }, + "require-dev": { + "phpunit/phpunit": "^11.5.2" + }, + "type": "library", + "autoload": { + "psr-4": { + "Spatie\\Robots\\": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Brent Roose", + "email": "brent@spatie.be", + "homepage": "https://spatie.be", + "role": "Developer" + } + ], + "description": "Determine if a page may be crawled from robots.txt and robots meta tags", + "homepage": "https://github.com/spatie/robots-txt", + "keywords": [ + "robots-txt", + "spatie" + ], + "support": { + "issues": "https://github.com/spatie/robots-txt/issues", + "source": "https://github.com/spatie/robots-txt/tree/2.5.4" + }, + "funding": [ + { + "url": "https://spatie.be/open-source/support-us", + "type": "custom" + }, + { + "url": "https://github.com/spatie", + "type": "github" + } + ], + "time": "2026-02-25T07:59:20+00:00" + }, { "name": "symfony/clock", "version": "v7.4.8", diff --git a/config/crawler.php b/config/crawler.php index 108176f..f633ce5 100644 --- a/config/crawler.php +++ b/config/crawler.php @@ -43,4 +43,5 @@ 'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'), 'min_domain_delay_seconds' => env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10), + 'robots_cache_ttl_seconds' => env('CRAWLER_ROBOTS_CACHE_TTL_SECONDS', 60 * 60 * 24), ]; diff --git a/tests/Feature/Jobs/ProcessCrawlJobTest.php b/tests/Feature/Jobs/ProcessCrawlJobTest.php index f504cb6..722f29f 100644 --- a/tests/Feature/Jobs/ProcessCrawlJobTest.php +++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php @@ -15,6 +15,7 @@ use Illuminate\Foundation\Testing\RefreshDatabase; use Illuminate\Support\Collection; use Illuminate\Support\Facades\Cache; +use Illuminate\Support\Facades\Http; use Illuminate\Support\Facades\Queue; use Mockery; use Tests\TestCase; @@ -343,6 +344,46 @@ public function test_handle_does_not_release_lock_after_completion(): void $this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.'); } + public function test_handle_writes_blocked_robots_when_disallowed(): void + { + Queue::fake(); + + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: *\nDisallow: /", + 200, + ), + ]); + + // FetchPageAction must never be called — the robots gate returns before the lock + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldNotReceive('__invoke'); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + $domain = $crawl->domain; + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + // Outcome row must record BlockedRobots + $this->assertDatabaseHas('page_crawls', [ + 'id' => $crawl->id, + 'outcome' => CrawlOutcomeEnum::BlockedRobots->value, + ]); + + // Page status must be Failed (BlockedRobots::toPageStatus() === Failed) + $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); + + // The politeness lock must still be acquirable — the gate returned before ever claiming it + $this->assertTrue( + Cache::lock("crawler:domain:{$domain}", 10)->get(), + 'Expected the politeness lock to be free (gate returned before acquiring it), but it was already held.', + ); + } + public function test_handle_acquires_domain_lock_before_fetching(): void { Queue::fake(); @@ -367,6 +408,55 @@ public function test_handle_acquires_domain_lock_before_fetching(): void $this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome); } + public function test_handle_proceeds_through_politeness_lock_when_robots_allow(): void + { + Queue::fake(); + + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: *\nAllow: /", + 200, + ), + ]); + + // FetchPageAction must be called exactly once — robots gate passed, fetch proceeds + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->once()->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Success, + statusCode: 200, + finalUrl: 'https://example.com/article', + title: 'Hello', + extractedText: 'hi', + outboundLinks: collect(), + wordCount: 1, + errorMessage: null, + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + $domain = $crawl->domain; + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + // Outcome must be Success — not BlockedRobots + $this->assertDatabaseHas('page_crawls', [ + 'id' => $crawl->id, + 'outcome' => CrawlOutcomeEnum::Success->value, + ]); + + // Page status must have advanced to Fetched + $this->assertSame(PageStatusEnum::Fetched, $page->fresh()->status); + + // Politeness lock must still be held (claimed during the fetch, never released) + $this->assertFalse( + Cache::lock("crawler:domain:{$domain}", 10)->get(), + 'Expected the politeness lock to be held after a successful fetch, but it was free.', + ); + } + private function mockFetchPageAction( CrawlOutcomeEnum $outcome, ?int $statusCode = null, diff --git a/tests/Unit/Services/PolitenessServiceTest.php b/tests/Unit/Services/PolitenessServiceTest.php index f9a2c6b..ce93fee 100644 --- a/tests/Unit/Services/PolitenessServiceTest.php +++ b/tests/Unit/Services/PolitenessServiceTest.php @@ -5,6 +5,7 @@ namespace Tests\Unit\Services; use App\Services\PolitenessService; +use Illuminate\Support\Facades\Http; use Tests\TestCase; class PolitenessServiceTest extends TestCase @@ -20,4 +21,36 @@ public function test_min_delay_for_respects_config_override(): void $this->assertSame(30, (new PolitenessService)->minDelayFor('example.com')); } + + public function test_min_delay_for_uses_robots_crawl_delay_when_higher(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + // Spatie does exact-token matching (lowercased), so the fixture UA + // must match the full string the service passes to crawlDelayFor(). + "User-agent: TroveBot/0.1 (+https://trove.lvl0.xyz/bot)\nCrawl-delay: 30", + 200, + ), + ]); + + config()->set('crawler.min_domain_delay_seconds', 10); + config()->set('crawler.user_agent', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'); + + $this->assertSame(30, app(PolitenessService::class)->minDelayFor('example.com')); + } + + public function test_min_delay_for_uses_config_when_higher_than_robots(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: TroveBot/0.1 (+https://trove.lvl0.xyz/bot)\nCrawl-delay: 10", + 200, + ), + ]); + + config()->set('crawler.min_domain_delay_seconds', 60); + config()->set('crawler.user_agent', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'); + + $this->assertSame(60, app(PolitenessService::class)->minDelayFor('example.com')); + } } diff --git a/tests/Unit/Services/RobotsServiceTest.php b/tests/Unit/Services/RobotsServiceTest.php new file mode 100644 index 0000000..746c173 --- /dev/null +++ b/tests/Unit/Services/RobotsServiceTest.php @@ -0,0 +1,96 @@ + Http::response( + "User-agent: *\nAllow: /", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $this->assertTrue($service->isAllowed('https://example.com/article', 'TroveBot/0.1')); + } + + public function test_is_allowed_returns_false_when_robots_txt_disallows_path(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: *\nDisallow: /", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $this->assertFalse($service->isAllowed('https://example.com/article', 'TroveBot/0.1')); + } + + public function test_is_allowed_returns_true_when_robots_txt_fetch_fails(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response('', 500), + ]); + + $service = app(RobotsService::class); + + $this->assertTrue($service->isAllowed('https://example.com/article', 'TroveBot/0.1')); + } + + public function test_is_allowed_caches_robots_txt_body_per_host(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: *\nAllow: /", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $service->isAllowed('https://example.com/article', 'TroveBot/0.1'); + $service->isAllowed('https://example.com/another-article', 'TroveBot/0.1'); + + Http::assertSentCount(1); + } + + public function test_crawl_delay_for_returns_parsed_value(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: TroveBot/0.1\nCrawl-delay: 30", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $this->assertSame(30, $service->crawlDelayFor('example.com', 'TroveBot/0.1')); + } + + public function test_crawl_delay_for_returns_null_when_absent(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: *\nDisallow: /private", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $this->assertNull($service->crawlDelayFor('example.com', 'TroveBot/0.1')); + } +}