483 lines
18 KiB
PHP
483 lines
18 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace Tests\Feature\Jobs;
|
|
|
|
use App\Actions\FetchPageAction;
|
|
use App\Enums\CrawlOutcomeEnum;
|
|
use App\Enums\PageStatusEnum;
|
|
use App\Jobs\ProcessCrawlJob;
|
|
use App\Models\Page;
|
|
use App\Models\PageCrawl;
|
|
use App\ValueObjects\FetchResult;
|
|
use Carbon\Carbon;
|
|
use Illuminate\Foundation\Testing\RefreshDatabase;
|
|
use Illuminate\Support\Collection;
|
|
use Illuminate\Support\Facades\Cache;
|
|
use Illuminate\Support\Facades\Http;
|
|
use Illuminate\Support\Facades\Queue;
|
|
use Mockery;
|
|
use Tests\TestCase;
|
|
|
|
/**
 * Feature tests for ProcessCrawlJob.
 *
 * Covers: dispatch when a PageCrawl is created, persistence of the fetch outcome on the
 * PageCrawl row, the resulting Page status transitions, retry scheduling and its cap,
 * outbound-link registration, the per-domain cache lock, and the robots.txt gate.
 *
 * FetchPageAction is always replaced by a Mockery double bound into the container, so no
 * page is ever fetched for real by the action itself.
 */
class ProcessCrawlJobTest extends TestCase
{
    use RefreshDatabase;

    public function test_creating_a_page_crawl_dispatches_process_crawl_job(): void
    {
        Queue::fake();

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);

        // create() (not createQuietly()) so model events fire for the PageCrawl;
        // presumably the dispatch is wired to one of them — confirm against the model/observer.
        PageCrawl::factory()->page($page)->create();

        Queue::assertPushed(ProcessCrawlJob::class);
    }

    public function test_dispatched_job_carries_the_correct_page_crawl(): void
    {
        Queue::fake();

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->create();

        Queue::assertPushed(
            ProcessCrawlJob::class,
            fn (ProcessCrawlJob $job) => $job->pageCrawl->id === $crawl->id,
        );
    }

    public function test_handle_writes_outcome_to_page_crawl_on_success(): void
    {
        Queue::fake();

        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Success,
            statusCode: 200,
            title: 'Hello',
            extractedText: 'hi',
            wordCount: 1,
        );

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $this->runJob($crawl);

        $fresh = $crawl->fresh();
        $this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome);
        // assertInstanceOf also fails on null, so no separate not-null check is needed.
        $this->assertInstanceOf(Carbon::class, $fresh->completed_at);
        $this->assertSame(200, $fresh->status_code);
        $this->assertNull($fresh->error_message);
    }

    public function test_handle_updates_page_to_fetched_on_success(): void
    {
        Queue::fake();

        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Success,
            statusCode: 200,
            title: 'Hello',
            extractedText: 'hi',
            wordCount: 1,
        );

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $this->runJob($crawl);

        $fresh = $page->fresh();
        $this->assertSame(PageStatusEnum::Fetched, $fresh->status);
        $this->assertInstanceOf(Carbon::class, $fresh->fetched_at);
        $this->assertSame('Hello', $fresh->title);
    }

    public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void
    {
        Queue::fake();

        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Rejected,
            statusCode: 200,
            errorMessage: 'Unsupported Content-Type: application/pdf',
        );

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/brochure.pdf']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $this->runJob($crawl);

        $fresh = $page->fresh();
        $this->assertSame(PageStatusEnum::Rejected, $fresh->status);
        $this->assertNull($fresh->fetched_at);
    }

    public function test_handle_updates_page_to_failed_on_blocked_4xx(): void
    {
        Queue::fake();

        $this->mockFetchPageAction(CrawlOutcomeEnum::Blocked4xx, statusCode: 404, errorMessage: 'HTTP 404');

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/gone']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $this->runJob($crawl);

        $fresh = $page->fresh();
        $this->assertSame(PageStatusEnum::Failed, $fresh->status);
        $this->assertInstanceOf(Carbon::class, $fresh->failed_at);
    }

    public function test_handle_updates_page_to_failed_on_timeout(): void
    {
        Queue::fake();

        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Timeout,
            errorMessage: 'Connection timed out after 10 seconds',
        );

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/slow']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $this->runJob($crawl);

        $fresh = $page->fresh();
        $this->assertSame(PageStatusEnum::Failed, $fresh->status);
        $this->assertInstanceOf(Carbon::class, $fresh->failed_at);
    }

    public function test_handle_schedules_retry_on_transient_failure(): void
    {
        Queue::fake();

        $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $this->runJob($crawl);

        // A second PageCrawl row (the retry) must have been inserted for the same page
        $this->assertSame(2, PageCrawl::where('page_id', $page->id)->count());

        // The new row is pending — outcome IS NULL
        $retryRow = PageCrawl::where('page_id', $page->id)
            ->whereNull('outcome')
            ->first();
        $this->assertNotNull($retryRow);

        // A ProcessCrawlJob must have been pushed for the retry row.
        // NOTE: only the job's target row is checked; any delay on the job is not asserted here.
        Queue::assertPushed(
            ProcessCrawlJob::class,
            fn (ProcessCrawlJob $job) => $job->pageCrawl->page_id === $page->id
                && $job->pageCrawl->id === $retryRow->id,
        );
    }

    public function test_handle_does_not_retry_after_three_attempts(): void
    {
        Queue::fake();

        $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);

        // Two failed attempts already exist; the pending crawl processed below is the third,
        // which is the cap this test pins.
        PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly();
        PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly();
        $thirdCrawl = PageCrawl::factory()->page($page)->createQuietly();

        $this->runJob($thirdCrawl);

        // No 4th row must appear — retry cap reached
        $this->assertSame(3, PageCrawl::where('page_id', $page->id)->count());

        // No retry job dispatched
        Queue::assertNotPushed(ProcessCrawlJob::class);
    }

    public function test_handle_writes_failed_outcome_to_page_crawl(): void
    {
        Queue::fake();

        $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'boom');

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $this->runJob($crawl);

        $this->assertDatabaseHas('page_crawls', [
            'id' => $crawl->id,
            'outcome' => CrawlOutcomeEnum::Failed->value,
            'status_code' => null,
            'error_message' => 'boom',
        ]);
    }

    public function test_handle_updates_page_to_failed_on_failed_outcome(): void
    {
        Queue::fake();

        $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $this->runJob($crawl);

        $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
    }

    public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
    {
        Queue::fake();

        $this->mockFetchPageAction(CrawlOutcomeEnum::Blocked5xx, statusCode: 503, errorMessage: 'HTTP 503');

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $this->runJob($crawl);

        $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
    }

    public function test_handle_updates_page_to_failed_on_blocked_robots(): void
    {
        Queue::fake();

        $this->mockFetchPageAction(CrawlOutcomeEnum::BlockedRobots, errorMessage: 'Disallowed by robots.txt');

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $this->runJob($crawl);

        $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
    }

    public function test_handle_does_not_register_outbound_links_on_failure(): void
    {
        Queue::fake();

        // The failed result deliberately carries a link, to prove it is ignored on failure.
        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Failed,
            outboundLinks: collect(['https://should-not-be-registered.com/page']),
            errorMessage: 'Connection refused',
        );

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/broken']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $this->runJob($crawl);

        $this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']);
        $this->assertSame(1, Page::count());
    }

    public function test_handle_registers_outbound_links_on_success(): void
    {
        Queue::fake();

        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Success,
            statusCode: 200,
            finalUrl: 'https://source.com/article',
            title: 'Source Article',
            extractedText: 'some text',
            outboundLinks: collect(['https://other.com/article-1', 'https://another.com/post-2']),
            wordCount: 2,
        );

        $page = Page::factory()->createQuietly(['url' => 'https://source.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $this->runJob($crawl);

        $this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']);
        $this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']);
        // The source page plus the two newly registered link targets.
        $this->assertSame(3, Page::count());
    }

    public function test_handle_releases_job_when_domain_is_locked(): void
    {
        Queue::fake();

        // Pre-acquire the lock so the job sees it as already held
        Cache::lock('crawler:domain:example.com', 10)->get();

        // The fetcher must NOT be called — the job should bail before reaching it
        $this->forbidFetch();

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $job = new ProcessCrawlJob($crawl);
        $job->handle();

        // No outcome written — handle() returned early.
        // NOTE: the release back onto the queue itself is not asserted, only the early return.
        $this->assertNull($crawl->fresh()->outcome);

        // Page status unchanged from its factory default (Discovered)
        $this->assertSame(PageStatusEnum::Discovered, $page->fresh()->status);
    }

    public function test_handle_does_not_release_lock_after_completion(): void
    {
        Queue::fake();

        $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $job = new ProcessCrawlJob($crawl);
        $job->handle();

        // If handle() called $lock->release(), this second get() would succeed (true).
        // It must fail (false) — the lock acquired inside handle() must still be held.
        $result = Cache::lock('crawler:domain:example.com', 10)->get();
        $this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.');
    }

    public function test_handle_writes_blocked_robots_when_disallowed(): void
    {
        Queue::fake();

        Http::fake([
            'https://example.com/robots.txt' => Http::response(
                "User-agent: *\nDisallow: /",
                200,
            ),
        ]);

        // FetchPageAction must never be called — the robots gate returns before the lock
        $this->forbidFetch();

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $domain = $crawl->domain;

        $this->runJob($crawl);

        // Outcome row must record BlockedRobots
        $this->assertDatabaseHas('page_crawls', [
            'id' => $crawl->id,
            'outcome' => CrawlOutcomeEnum::BlockedRobots->value,
        ]);

        // Page status must be Failed (BlockedRobots::toPageStatus() === Failed)
        $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);

        // The politeness lock must still be acquirable — the gate returned before ever claiming it
        $this->assertTrue(
            Cache::lock("crawler:domain:{$domain}", 10)->get(),
            'Expected the politeness lock to be free (gate returned before acquiring it), but it was already held.',
        );
    }

    public function test_handle_acquires_domain_lock_before_fetching(): void
    {
        Queue::fake();

        $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);

        $page = Page::factory()->createQuietly(['url' => 'https://lock-test.example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $domain = $crawl->domain;

        $this->runJob($crawl);

        // The lock must still be held after handle() completes — a second attempt to acquire it fails
        $this->assertFalse(
            Cache::lock("crawler:domain:{$domain}", 10)->get(),
            'Expected the domain lock to still be held after handle() ran, but it was free.',
        );

        // The fetch ran — outcome was written (proves the lock did not block execution)
        $this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome);
    }

    public function test_handle_proceeds_through_politeness_lock_when_robots_allow(): void
    {
        Queue::fake();

        Http::fake([
            'https://example.com/robots.txt' => Http::response(
                "User-agent: *\nAllow: /",
                200,
            ),
        ]);

        // FetchPageAction must be called exactly once — robots gate passed, fetch proceeds
        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Success,
            statusCode: 200,
            finalUrl: 'https://example.com/article',
            title: 'Hello',
            extractedText: 'hi',
            wordCount: 1,
            times: 1,
        );

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $domain = $crawl->domain;

        $this->runJob($crawl);

        // Outcome must be Success — not BlockedRobots
        $this->assertDatabaseHas('page_crawls', [
            'id' => $crawl->id,
            'outcome' => CrawlOutcomeEnum::Success->value,
        ]);

        // Page status must have advanced to Fetched
        $this->assertSame(PageStatusEnum::Fetched, $page->fresh()->status);

        // Politeness lock must still be held (claimed during the fetch, never released)
        $this->assertFalse(
            Cache::lock("crawler:domain:{$domain}", 10)->get(),
            'Expected the politeness lock to be held after a successful fetch, but it was free.',
        );
    }

    /**
     * Resolves a ProcessCrawlJob for the given crawl through the container and runs
     * handle() synchronously, bypassing the (faked) queue.
     */
    private function runJob(PageCrawl $crawl): void
    {
        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();
    }

    /**
     * Binds a FetchPageAction double that must never be invoked.
     *
     * The expectation is verified when Mockery closes at the end of the test.
     */
    private function forbidFetch(): void
    {
        $fetcher = Mockery::mock(FetchPageAction::class);
        $fetcher->shouldNotReceive('__invoke');

        $this->app->instance(FetchPageAction::class, $fetcher);
    }

    /**
     * Binds a FetchPageAction double that returns a FetchResult built from the arguments.
     *
     * @param CrawlOutcomeEnum $outcome       Outcome the fake fetch reports.
     * @param int|null         $statusCode    HTTP status carried by the result, if any.
     * @param string|null      $finalUrl      Final URL carried by the result.
     * @param string|null      $title         Page title carried by the result.
     * @param string|null      $extractedText Extracted text carried by the result.
     * @param Collection|null  $outboundLinks Outbound links; null becomes an empty collection.
     * @param int|null         $wordCount     Word count carried by the result.
     * @param string|null      $errorMessage  Error message carried by the result.
     * @param int|null         $times         Exact number of invocations to require; null leaves
     *                                        the call count unconstrained (zero or more).
     */
    private function mockFetchPageAction(
        CrawlOutcomeEnum $outcome,
        ?int $statusCode = null,
        ?string $finalUrl = 'https://example.com/article',
        ?string $title = null,
        ?string $extractedText = null,
        ?Collection $outboundLinks = null,
        ?int $wordCount = null,
        ?string $errorMessage = null,
        ?int $times = null,
    ): void {
        $result = new FetchResult(
            outcome: $outcome,
            statusCode: $statusCode,
            finalUrl: $finalUrl,
            title: $title,
            extractedText: $extractedText,
            outboundLinks: $outboundLinks ?? collect(),
            wordCount: $wordCount,
            errorMessage: $errorMessage,
        );

        $fetcher = Mockery::mock(FetchPageAction::class);
        $expectation = $fetcher->shouldReceive('__invoke')->andReturn($result);

        if ($times !== null) {
            $expectation->times($times);
        }

        $this->app->instance(FetchPageAction::class, $fetcher);
    }
}
|