11 - Gate ProcessCrawlJob with per-domain politeness lock
This commit is contained in:
parent
7171348370
commit
1538ceeb6e
2 changed files with 98 additions and 18 deletions
|
|
@ -7,9 +7,11 @@
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
use App\Enums\CrawlOutcomeEnum;
|
||||||
use App\Enums\PageStatusEnum;
|
use App\Enums\PageStatusEnum;
|
||||||
use App\Models\PageCrawl;
|
use App\Models\PageCrawl;
|
||||||
|
use App\Services\PolitenessService;
|
||||||
use App\ValueObjects\FetchResult;
|
use App\ValueObjects\FetchResult;
|
||||||
use Illuminate\Contracts\Queue\ShouldQueue;
|
use Illuminate\Contracts\Queue\ShouldQueue;
|
||||||
use Illuminate\Foundation\Queue\Queueable;
|
use Illuminate\Foundation\Queue\Queueable;
|
||||||
|
use Illuminate\Support\Facades\Cache;
|
||||||
|
|
||||||
class ProcessCrawlJob implements ShouldQueue
|
class ProcessCrawlJob implements ShouldQueue
|
||||||
{
|
{
|
||||||
|
|
@ -19,10 +21,21 @@ public function __construct(
|
||||||
public PageCrawl $pageCrawl,
|
public PageCrawl $pageCrawl,
|
||||||
) {}
|
) {}
|
||||||
|
|
||||||
public function handle(
|
public function handle(): void
|
||||||
FetchPageAction $fetcher,
|
{
|
||||||
RegisterDiscoveredPageAction $register,
|
$fetcher = resolve(FetchPageAction::class);
|
||||||
): void {
|
$register = resolve(RegisterDiscoveredPageAction::class);
|
||||||
|
$politenessService = resolve(PolitenessService::class);
|
||||||
|
|
||||||
|
$delay = $politenessService->minDelayFor($this->pageCrawl->domain);
|
||||||
|
$lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay);
|
||||||
|
|
||||||
|
if (! $lock->get()) {
|
||||||
|
$this->release($delay);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
/** @var FetchResult $result */
|
/** @var FetchResult $result */
|
||||||
$result = $fetcher($this->pageCrawl->page->url);
|
$result = $fetcher($this->pageCrawl->page->url);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,6 @@
|
||||||
namespace Tests\Feature\Jobs;
|
namespace Tests\Feature\Jobs;
|
||||||
|
|
||||||
use App\Actions\FetchPageAction;
|
use App\Actions\FetchPageAction;
|
||||||
use App\Actions\RegisterDiscoveredPageAction;
|
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
use App\Enums\CrawlOutcomeEnum;
|
||||||
use App\Enums\PageStatusEnum;
|
use App\Enums\PageStatusEnum;
|
||||||
use App\Jobs\ProcessCrawlJob;
|
use App\Jobs\ProcessCrawlJob;
|
||||||
|
|
@ -15,6 +14,7 @@
|
||||||
use Carbon\Carbon;
|
use Carbon\Carbon;
|
||||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
use Illuminate\Foundation\Testing\RefreshDatabase;
|
||||||
use Illuminate\Support\Collection;
|
use Illuminate\Support\Collection;
|
||||||
|
use Illuminate\Support\Facades\Cache;
|
||||||
use Illuminate\Support\Facades\Queue;
|
use Illuminate\Support\Facades\Queue;
|
||||||
use Mockery;
|
use Mockery;
|
||||||
use Tests\TestCase;
|
use Tests\TestCase;
|
||||||
|
|
@ -56,7 +56,7 @@ public function test_handle_writes_outcome_to_page_crawl_on_success(): void
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
|
->handle();
|
||||||
|
|
||||||
$fresh = $crawl->fresh();
|
$fresh = $crawl->fresh();
|
||||||
$this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome);
|
$this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome);
|
||||||
|
|
@ -76,7 +76,7 @@ public function test_handle_updates_page_to_fetched_on_success(): void
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
|
->handle();
|
||||||
|
|
||||||
$fresh = $page->fresh();
|
$fresh = $page->fresh();
|
||||||
$this->assertSame(PageStatusEnum::Fetched, $fresh->status);
|
$this->assertSame(PageStatusEnum::Fetched, $fresh->status);
|
||||||
|
|
@ -95,7 +95,7 @@ public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
|
->handle();
|
||||||
|
|
||||||
$fresh = $page->fresh();
|
$fresh = $page->fresh();
|
||||||
$this->assertSame(PageStatusEnum::Rejected, $fresh->status);
|
$this->assertSame(PageStatusEnum::Rejected, $fresh->status);
|
||||||
|
|
@ -112,7 +112,7 @@ public function test_handle_updates_page_to_failed_on_blocked_4xx(): void
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
|
->handle();
|
||||||
|
|
||||||
$fresh = $page->fresh();
|
$fresh = $page->fresh();
|
||||||
$this->assertSame(PageStatusEnum::Failed, $fresh->status);
|
$this->assertSame(PageStatusEnum::Failed, $fresh->status);
|
||||||
|
|
@ -130,7 +130,7 @@ public function test_handle_updates_page_to_failed_on_timeout(): void
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
|
->handle();
|
||||||
|
|
||||||
$fresh = $page->fresh();
|
$fresh = $page->fresh();
|
||||||
$this->assertSame(PageStatusEnum::Failed, $fresh->status);
|
$this->assertSame(PageStatusEnum::Failed, $fresh->status);
|
||||||
|
|
@ -148,7 +148,7 @@ public function test_handle_schedules_retry_on_transient_failure(): void
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
|
->handle();
|
||||||
|
|
||||||
// A second PageCrawl row (the retry) must have been inserted for the same page
|
// A second PageCrawl row (the retry) must have been inserted for the same page
|
||||||
$this->assertSame(2, PageCrawl::where('page_id', $page->id)->count());
|
$this->assertSame(2, PageCrawl::where('page_id', $page->id)->count());
|
||||||
|
|
@ -181,7 +181,7 @@ public function test_handle_does_not_retry_after_three_attempts(): void
|
||||||
$thirdCrawl = PageCrawl::factory()->page($page)->createQuietly();
|
$thirdCrawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl])
|
app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl])
|
||||||
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
|
->handle();
|
||||||
|
|
||||||
// No 4th row must appear — retry cap reached
|
// No 4th row must appear — retry cap reached
|
||||||
$this->assertSame(3, PageCrawl::where('page_id', $page->id)->count());
|
$this->assertSame(3, PageCrawl::where('page_id', $page->id)->count());
|
||||||
|
|
@ -200,7 +200,7 @@ public function test_handle_writes_failed_outcome_to_page_crawl(): void
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
|
->handle();
|
||||||
|
|
||||||
$this->assertDatabaseHas('page_crawls', [
|
$this->assertDatabaseHas('page_crawls', [
|
||||||
'id' => $crawl->id,
|
'id' => $crawl->id,
|
||||||
|
|
@ -220,7 +220,7 @@ public function test_handle_updates_page_to_failed_on_failed_outcome(): void
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
|
->handle();
|
||||||
|
|
||||||
$this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
|
$this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
|
||||||
}
|
}
|
||||||
|
|
@ -235,7 +235,7 @@ public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
|
->handle();
|
||||||
|
|
||||||
$this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
|
$this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
|
||||||
}
|
}
|
||||||
|
|
@ -250,7 +250,7 @@ public function test_handle_updates_page_to_failed_on_blocked_robots(): void
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
|
->handle();
|
||||||
|
|
||||||
$this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
|
$this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
|
||||||
}
|
}
|
||||||
|
|
@ -269,7 +269,7 @@ public function test_handle_does_not_register_outbound_links_on_failure(): void
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
|
->handle();
|
||||||
|
|
||||||
$this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']);
|
$this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']);
|
||||||
$this->assertSame(1, Page::count());
|
$this->assertSame(1, Page::count());
|
||||||
|
|
@ -293,13 +293,80 @@ public function test_handle_registers_outbound_links_on_success(): void
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
|
->handle();
|
||||||
|
|
||||||
$this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']);
|
$this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']);
|
||||||
$this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']);
|
$this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']);
|
||||||
$this->assertSame(3, Page::count());
|
$this->assertSame(3, Page::count());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function test_handle_releases_job_when_domain_is_locked(): void
|
||||||
|
{
|
||||||
|
Queue::fake();
|
||||||
|
|
||||||
|
// Pre-acquire the lock so the job sees it as already held
|
||||||
|
Cache::lock('crawler:domain:example.com', 10)->get();
|
||||||
|
|
||||||
|
// The fetcher must NOT be called — the job should bail before reaching it
|
||||||
|
$fetcher = Mockery::mock(FetchPageAction::class);
|
||||||
|
$fetcher->shouldNotReceive('__invoke');
|
||||||
|
$this->app->instance(FetchPageAction::class, $fetcher);
|
||||||
|
|
||||||
|
$page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
|
||||||
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
|
$job = new ProcessCrawlJob($crawl);
|
||||||
|
$job->handle();
|
||||||
|
|
||||||
|
// No outcome written — handle() returned early
|
||||||
|
$this->assertNull($crawl->fresh()->outcome);
|
||||||
|
|
||||||
|
// Page status unchanged from its factory default (Discovered)
|
||||||
|
$this->assertSame(PageStatusEnum::Discovered, $page->fresh()->status);
|
||||||
|
}
|
||||||
|
|
||||||
|
public function test_handle_does_not_release_lock_after_completion(): void
|
||||||
|
{
|
||||||
|
Queue::fake();
|
||||||
|
|
||||||
|
$this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);
|
||||||
|
|
||||||
|
$page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
|
||||||
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
|
$job = new ProcessCrawlJob($crawl);
|
||||||
|
$job->handle();
|
||||||
|
|
||||||
|
// If handle() called $lock->release(), this second get() would succeed (true).
|
||||||
|
// It must fail (false) — the lock acquired inside handle() must still be held.
|
||||||
|
$result = Cache::lock('crawler:domain:example.com', 10)->get();
|
||||||
|
$this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.');
|
||||||
|
}
|
||||||
|
|
||||||
|
public function test_handle_acquires_domain_lock_before_fetching(): void
|
||||||
|
{
|
||||||
|
Queue::fake();
|
||||||
|
|
||||||
|
$this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);
|
||||||
|
|
||||||
|
$page = Page::factory()->createQuietly(['url' => 'https://lock-test.example.com/article']);
|
||||||
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
|
$domain = $crawl->domain;
|
||||||
|
|
||||||
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
|
->handle();
|
||||||
|
|
||||||
|
// The lock must still be held after handle() completes — a second attempt to acquire it fails
|
||||||
|
$this->assertFalse(
|
||||||
|
Cache::lock("crawler:domain:{$domain}", 10)->get(),
|
||||||
|
'Expected the domain lock to still be held after handle() ran, but it was free.',
|
||||||
|
);
|
||||||
|
|
||||||
|
// The fetch ran — outcome was written (proves the lock did not block execution)
|
||||||
|
$this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome);
|
||||||
|
}
|
||||||
|
|
||||||
private function mockFetchPageAction(
|
private function mockFetchPageAction(
|
||||||
CrawlOutcomeEnum $outcome,
|
CrawlOutcomeEnum $outcome,
|
||||||
?int $statusCode = null,
|
?int $statusCode = null,
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue