11 - Gate ProcessCrawlJob with per-domain politeness lock

This commit is contained in:
myrmidex 2026-04-27 01:25:46 +02:00
parent 7171348370
commit 1538ceeb6e
2 changed files with 98 additions and 18 deletions

View file

@ -7,9 +7,11 @@
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use App\Models\PageCrawl;
use App\Services\PolitenessService;
use App\ValueObjects\FetchResult;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Queue\Queueable;
use Illuminate\Support\Facades\Cache;
class ProcessCrawlJob implements ShouldQueue
{
@ -19,10 +21,21 @@ public function __construct(
public PageCrawl $pageCrawl,
) {}
public function handle(
FetchPageAction $fetcher,
RegisterDiscoveredPageAction $register,
): void {
public function handle(): void
{
$fetcher = resolve(FetchPageAction::class);
$register = resolve(RegisterDiscoveredPageAction::class);
$politenessService = resolve(PolitenessService::class);
$delay = $politenessService->minDelayFor($this->pageCrawl->domain);
$lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay);
if (! $lock->get()) {
$this->release($delay);
return;
}
/** @var FetchResult $result */
$result = $fetcher($this->pageCrawl->page->url);

View file

@ -5,7 +5,6 @@
namespace Tests\Feature\Jobs;
use App\Actions\FetchPageAction;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use App\Jobs\ProcessCrawlJob;
@ -15,6 +14,7 @@
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Queue;
use Mockery;
use Tests\TestCase;
@ -56,7 +56,7 @@ public function test_handle_writes_outcome_to_page_crawl_on_success(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
->handle();
$fresh = $crawl->fresh();
$this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome);
@ -76,7 +76,7 @@ public function test_handle_updates_page_to_fetched_on_success(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
->handle();
$fresh = $page->fresh();
$this->assertSame(PageStatusEnum::Fetched, $fresh->status);
@ -95,7 +95,7 @@ public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
->handle();
$fresh = $page->fresh();
$this->assertSame(PageStatusEnum::Rejected, $fresh->status);
@ -112,7 +112,7 @@ public function test_handle_updates_page_to_failed_on_blocked_4xx(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
->handle();
$fresh = $page->fresh();
$this->assertSame(PageStatusEnum::Failed, $fresh->status);
@ -130,7 +130,7 @@ public function test_handle_updates_page_to_failed_on_timeout(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
->handle();
$fresh = $page->fresh();
$this->assertSame(PageStatusEnum::Failed, $fresh->status);
@ -148,7 +148,7 @@ public function test_handle_schedules_retry_on_transient_failure(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
->handle();
// A second PageCrawl row (the retry) must have been inserted for the same page
$this->assertSame(2, PageCrawl::where('page_id', $page->id)->count());
@ -181,7 +181,7 @@ public function test_handle_does_not_retry_after_three_attempts(): void
$thirdCrawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
->handle();
// No 4th row must appear — retry cap reached
$this->assertSame(3, PageCrawl::where('page_id', $page->id)->count());
@ -200,7 +200,7 @@ public function test_handle_writes_failed_outcome_to_page_crawl(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
->handle();
$this->assertDatabaseHas('page_crawls', [
'id' => $crawl->id,
@ -220,7 +220,7 @@ public function test_handle_updates_page_to_failed_on_failed_outcome(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
->handle();
$this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
}
@ -235,7 +235,7 @@ public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
->handle();
$this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
}
@ -250,7 +250,7 @@ public function test_handle_updates_page_to_failed_on_blocked_robots(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
->handle();
$this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
}
@ -269,7 +269,7 @@ public function test_handle_does_not_register_outbound_links_on_failure(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
->handle();
$this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']);
$this->assertSame(1, Page::count());
@ -293,13 +293,80 @@ public function test_handle_registers_outbound_links_on_success(): void
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
->handle();
$this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']);
$this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']);
$this->assertSame(3, Page::count());
}
public function test_handle_releases_job_when_domain_is_locked(): void
{
Queue::fake();
// Pre-acquire the lock so the job sees it as already held
Cache::lock('crawler:domain:example.com', 10)->get();
// The fetcher must NOT be called — the job should bail before reaching it
$fetcher = Mockery::mock(FetchPageAction::class);
$fetcher->shouldNotReceive('__invoke');
$this->app->instance(FetchPageAction::class, $fetcher);
$page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
$crawl = PageCrawl::factory()->page($page)->createQuietly();
$job = new ProcessCrawlJob($crawl);
$job->handle();
// No outcome written — handle() returned early
$this->assertNull($crawl->fresh()->outcome);
// Page status unchanged from its factory default (Discovered)
$this->assertSame(PageStatusEnum::Discovered, $page->fresh()->status);
}
public function test_handle_does_not_release_lock_after_completion(): void
{
Queue::fake();
$this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);
$page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
$crawl = PageCrawl::factory()->page($page)->createQuietly();
$job = new ProcessCrawlJob($crawl);
$job->handle();
// If handle() called $lock->release(), this second get() would succeed (true).
// It must fail (false) — the lock acquired inside handle() must still be held.
$result = Cache::lock('crawler:domain:example.com', 10)->get();
$this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.');
}
public function test_handle_acquires_domain_lock_before_fetching(): void
{
Queue::fake();
$this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);
$page = Page::factory()->createQuietly(['url' => 'https://lock-test.example.com/article']);
$crawl = PageCrawl::factory()->page($page)->createQuietly();
$domain = $crawl->domain;
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle();
// The lock must still be held after handle() completes — a second attempt to acquire it fails
$this->assertFalse(
Cache::lock("crawler:domain:{$domain}", 10)->get(),
'Expected the domain lock to still be held after handle() ran, but it was free.',
);
// The fetch ran — outcome was written (proves the lock did not block execution)
$this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome);
}
private function mockFetchPageAction(
CrawlOutcomeEnum $outcome,
?int $statusCode = null,