11 - Gate ProcessCrawlJob with per-domain politeness lock

This commit is contained in:
myrmidex 2026-04-27 01:25:46 +02:00
parent 7171348370
commit 1538ceeb6e
2 changed files with 98 additions and 18 deletions

View file

@ -7,9 +7,11 @@
use App\Enums\CrawlOutcomeEnum; use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum; use App\Enums\PageStatusEnum;
use App\Models\PageCrawl; use App\Models\PageCrawl;
use App\Services\PolitenessService;
use App\ValueObjects\FetchResult; use App\ValueObjects\FetchResult;
use Illuminate\Contracts\Queue\ShouldQueue; use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Queue\Queueable; use Illuminate\Foundation\Queue\Queueable;
use Illuminate\Support\Facades\Cache;
class ProcessCrawlJob implements ShouldQueue class ProcessCrawlJob implements ShouldQueue
{ {
@ -19,10 +21,21 @@ public function __construct(
public PageCrawl $pageCrawl, public PageCrawl $pageCrawl,
) {} ) {}
public function handle( public function handle(): void
FetchPageAction $fetcher, {
RegisterDiscoveredPageAction $register, $fetcher = resolve(FetchPageAction::class);
): void { $register = resolve(RegisterDiscoveredPageAction::class);
$politenessService = resolve(PolitenessService::class);
$delay = $politenessService->minDelayFor($this->pageCrawl->domain);
$lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay);
if (! $lock->get()) {
$this->release($delay);
return;
}
/** @var FetchResult $result */ /** @var FetchResult $result */
$result = $fetcher($this->pageCrawl->page->url); $result = $fetcher($this->pageCrawl->page->url);

View file

@ -5,7 +5,6 @@
namespace Tests\Feature\Jobs; namespace Tests\Feature\Jobs;
use App\Actions\FetchPageAction; use App\Actions\FetchPageAction;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\CrawlOutcomeEnum; use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum; use App\Enums\PageStatusEnum;
use App\Jobs\ProcessCrawlJob; use App\Jobs\ProcessCrawlJob;
@ -15,6 +14,7 @@
use Carbon\Carbon; use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase; use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Collection; use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Queue; use Illuminate\Support\Facades\Queue;
use Mockery; use Mockery;
use Tests\TestCase; use Tests\TestCase;
@ -56,7 +56,7 @@ public function test_handle_writes_outcome_to_page_crawl_on_success(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly(); $crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); ->handle();
$fresh = $crawl->fresh(); $fresh = $crawl->fresh();
$this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome); $this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome);
@ -76,7 +76,7 @@ public function test_handle_updates_page_to_fetched_on_success(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly(); $crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); ->handle();
$fresh = $page->fresh(); $fresh = $page->fresh();
$this->assertSame(PageStatusEnum::Fetched, $fresh->status); $this->assertSame(PageStatusEnum::Fetched, $fresh->status);
@ -95,7 +95,7 @@ public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly(); $crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); ->handle();
$fresh = $page->fresh(); $fresh = $page->fresh();
$this->assertSame(PageStatusEnum::Rejected, $fresh->status); $this->assertSame(PageStatusEnum::Rejected, $fresh->status);
@ -112,7 +112,7 @@ public function test_handle_updates_page_to_failed_on_blocked_4xx(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly(); $crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); ->handle();
$fresh = $page->fresh(); $fresh = $page->fresh();
$this->assertSame(PageStatusEnum::Failed, $fresh->status); $this->assertSame(PageStatusEnum::Failed, $fresh->status);
@ -130,7 +130,7 @@ public function test_handle_updates_page_to_failed_on_timeout(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly(); $crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); ->handle();
$fresh = $page->fresh(); $fresh = $page->fresh();
$this->assertSame(PageStatusEnum::Failed, $fresh->status); $this->assertSame(PageStatusEnum::Failed, $fresh->status);
@ -148,7 +148,7 @@ public function test_handle_schedules_retry_on_transient_failure(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly(); $crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); ->handle();
// A second PageCrawl row (the retry) must have been inserted for the same page // A second PageCrawl row (the retry) must have been inserted for the same page
$this->assertSame(2, PageCrawl::where('page_id', $page->id)->count()); $this->assertSame(2, PageCrawl::where('page_id', $page->id)->count());
@ -181,7 +181,7 @@ public function test_handle_does_not_retry_after_three_attempts(): void
$thirdCrawl = PageCrawl::factory()->page($page)->createQuietly(); $thirdCrawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl]) app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); ->handle();
// No 4th row must appear — retry cap reached // No 4th row must appear — retry cap reached
$this->assertSame(3, PageCrawl::where('page_id', $page->id)->count()); $this->assertSame(3, PageCrawl::where('page_id', $page->id)->count());
@ -200,7 +200,7 @@ public function test_handle_writes_failed_outcome_to_page_crawl(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly(); $crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); ->handle();
$this->assertDatabaseHas('page_crawls', [ $this->assertDatabaseHas('page_crawls', [
'id' => $crawl->id, 'id' => $crawl->id,
@ -220,7 +220,7 @@ public function test_handle_updates_page_to_failed_on_failed_outcome(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly(); $crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); ->handle();
$this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
} }
@ -235,7 +235,7 @@ public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly(); $crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); ->handle();
$this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
} }
@ -250,7 +250,7 @@ public function test_handle_updates_page_to_failed_on_blocked_robots(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly(); $crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); ->handle();
$this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
} }
@ -269,7 +269,7 @@ public function test_handle_does_not_register_outbound_links_on_failure(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly(); $crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); ->handle();
$this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']); $this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']);
$this->assertSame(1, Page::count()); $this->assertSame(1, Page::count());
@ -293,13 +293,80 @@ public function test_handle_registers_outbound_links_on_success(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly(); $crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); ->handle();
$this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']); $this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']);
$this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']); $this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']);
$this->assertSame(3, Page::count()); $this->assertSame(3, Page::count());
} }
public function test_handle_releases_job_when_domain_is_locked(): void
{
Queue::fake();
// Pre-acquire the lock so the job sees it as already held
Cache::lock('crawler:domain:example.com', 10)->get();
// The fetcher must NOT be called — the job should bail before reaching it
$fetcher = Mockery::mock(FetchPageAction::class);
$fetcher->shouldNotReceive('__invoke');
$this->app->instance(FetchPageAction::class, $fetcher);
$page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
$crawl = PageCrawl::factory()->page($page)->createQuietly();
$job = new ProcessCrawlJob($crawl);
$job->handle();
// No outcome written — handle() returned early
$this->assertNull($crawl->fresh()->outcome);
// Page status unchanged from its factory default (Discovered)
$this->assertSame(PageStatusEnum::Discovered, $page->fresh()->status);
}
public function test_handle_does_not_release_lock_after_completion(): void
{
Queue::fake();
$this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);
$page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
$crawl = PageCrawl::factory()->page($page)->createQuietly();
$job = new ProcessCrawlJob($crawl);
$job->handle();
// If handle() called $lock->release(), this second get() would succeed (true).
// It must fail (false) — the lock acquired inside handle() must still be held.
$result = Cache::lock('crawler:domain:example.com', 10)->get();
$this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.');
}
public function test_handle_acquires_domain_lock_before_fetching(): void
{
Queue::fake();
$this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);
$page = Page::factory()->createQuietly(['url' => 'https://lock-test.example.com/article']);
$crawl = PageCrawl::factory()->page($page)->createQuietly();
$domain = $crawl->domain;
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle();
// The lock must still be held after handle() completes — a second attempt to acquire it fails
$this->assertFalse(
Cache::lock("crawler:domain:{$domain}", 10)->get(),
'Expected the domain lock to still be held after handle() ran, but it was free.',
);
// The fetch ran — outcome was written (proves the lock did not block execution)
$this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome);
}
private function mockFetchPageAction( private function mockFetchPageAction(
CrawlOutcomeEnum $outcome, CrawlOutcomeEnum $outcome,
?int $statusCode = null, ?int $statusCode = null,