From 1538ceeb6e8cfaf7d4e918ae2c139e95d2c3fbaa Mon Sep 17 00:00:00 2001 From: myrmidex Date: Mon, 27 Apr 2026 01:25:46 +0200 Subject: [PATCH] 11 - Gate ProcessCrawlJob with per-domain politeness lock --- app/Jobs/ProcessCrawlJob.php | 21 ++++- tests/Feature/Jobs/ProcessCrawlJobTest.php | 95 ++++++++++++++++++---- 2 files changed, 98 insertions(+), 18 deletions(-) diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php index 11a2993..d2928d0 100644 --- a/app/Jobs/ProcessCrawlJob.php +++ b/app/Jobs/ProcessCrawlJob.php @@ -7,9 +7,11 @@ use App\Enums\CrawlOutcomeEnum; use App\Enums\PageStatusEnum; use App\Models\PageCrawl; +use App\Services\PolitenessService; use App\ValueObjects\FetchResult; use Illuminate\Contracts\Queue\ShouldQueue; use Illuminate\Foundation\Queue\Queueable; +use Illuminate\Support\Facades\Cache; class ProcessCrawlJob implements ShouldQueue { @@ -19,10 +21,21 @@ public function __construct( public PageCrawl $pageCrawl, ) {} - public function handle( - FetchPageAction $fetcher, - RegisterDiscoveredPageAction $register, - ): void { + public function handle(): void + { + $fetcher = resolve(FetchPageAction::class); + $register = resolve(RegisterDiscoveredPageAction::class); + $politenessService = resolve(PolitenessService::class); + + $delay = $politenessService->minDelayFor($this->pageCrawl->domain); + $lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay); + + if (! $lock->get()) { + $this->release($delay); + + return; + } + /** @var FetchResult $result */ $result = $fetcher($this->pageCrawl->page->url); diff --git a/tests/Feature/Jobs/ProcessCrawlJobTest.php b/tests/Feature/Jobs/ProcessCrawlJobTest.php index 487f6d7..f504cb6 100644 --- a/tests/Feature/Jobs/ProcessCrawlJobTest.php +++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php @@ -5,7 +5,6 @@ namespace Tests\Feature\Jobs; use App\Actions\FetchPageAction; -use App\Actions\RegisterDiscoveredPageAction; use App\Enums\CrawlOutcomeEnum; use App\Enums\PageStatusEnum; use App\Jobs\ProcessCrawlJob; @@ -15,6 +14,7 @@ use Carbon\Carbon; use Illuminate\Foundation\Testing\RefreshDatabase; use Illuminate\Support\Collection; +use Illuminate\Support\Facades\Cache; use Illuminate\Support\Facades\Queue; use Mockery; use Tests\TestCase; @@ -56,7 +56,7 @@ public function test_handle_writes_outcome_to_page_crawl_on_success(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $fresh = $crawl->fresh(); $this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome); @@ -76,7 +76,7 @@ public function test_handle_updates_page_to_fetched_on_success(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $fresh = $page->fresh(); $this->assertSame(PageStatusEnum::Fetched, $fresh->status); @@ -95,7 +95,7 @@ public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $fresh = $page->fresh(); $this->assertSame(PageStatusEnum::Rejected, $fresh->status); @@ -112,7 +112,7 @@ public function test_handle_updates_page_to_failed_on_blocked_4xx(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $fresh = $page->fresh(); $this->assertSame(PageStatusEnum::Failed, $fresh->status); @@ -130,7 +130,7 @@ public function test_handle_updates_page_to_failed_on_timeout(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $fresh = $page->fresh(); $this->assertSame(PageStatusEnum::Failed, $fresh->status); @@ -148,7 +148,7 @@ public function test_handle_schedules_retry_on_transient_failure(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); // A second PageCrawl row (the retry) must have been inserted for the same page $this->assertSame(2, PageCrawl::where('page_id', $page->id)->count()); @@ -181,7 +181,7 @@ public function test_handle_does_not_retry_after_three_attempts(): void $thirdCrawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); // No 4th row must appear — retry cap reached $this->assertSame(3, PageCrawl::where('page_id', $page->id)->count()); @@ -200,7 +200,7 @@ public function test_handle_writes_failed_outcome_to_page_crawl(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $this->assertDatabaseHas('page_crawls', [ 'id' => $crawl->id, @@ -220,7 +220,7 @@ public function test_handle_updates_page_to_failed_on_failed_outcome(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); } @@ -235,7 +235,7 @@ public function test_handle_updates_page_to_failed_on_blocked_5xx(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); } @@ -250,7 +250,7 @@ public function test_handle_updates_page_to_failed_on_blocked_robots(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); } @@ -269,7 +269,7 @@ public function test_handle_does_not_register_outbound_links_on_failure(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']); $this->assertSame(1, Page::count()); @@ -293,13 +293,80 @@ public function test_handle_registers_outbound_links_on_success(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']); $this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']); $this->assertSame(3, Page::count()); } + public function test_handle_releases_job_when_domain_is_locked(): void + { + Queue::fake(); + + // Pre-acquire the lock so the job sees it as already held + Cache::lock('crawler:domain:example.com', 10)->get(); + + // The fetcher must NOT be called — the job should bail before reaching it + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldNotReceive('__invoke'); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + $job = new ProcessCrawlJob($crawl); + $job->handle(); + + // No outcome written — handle() returned early + $this->assertNull($crawl->fresh()->outcome); + + // Page status unchanged from its factory default (Discovered) + $this->assertSame(PageStatusEnum::Discovered, $page->fresh()->status); + } + + public function test_handle_does_not_release_lock_after_completion(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + $job = new ProcessCrawlJob($crawl); + $job->handle(); + + // If handle() called $lock->release(), this second get() would succeed (true). + // It must fail (false) — the lock acquired inside handle() must still be held. + $result = Cache::lock('crawler:domain:example.com', 10)->get(); + $this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.'); + } + + public function test_handle_acquires_domain_lock_before_fetching(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200); + + $page = Page::factory()->createQuietly(['url' => 'https://lock-test.example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + $domain = $crawl->domain; + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + // The lock must still be held after handle() completes — a second attempt to acquire it fails + $this->assertFalse( + Cache::lock("crawler:domain:{$domain}", 10)->get(), + 'Expected the domain lock to still be held after handle() ran, but it was free.', + ); + + // The fetch ran — outcome was written (proves the lock did not block execution) + $this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome); + } + private function mockFetchPageAction( CrawlOutcomeEnum $outcome, ?int $statusCode = null,