trove/tests/Feature/Jobs/ProcessCrawlJobTest.php

437 lines
16 KiB
PHP

<?php
declare(strict_types=1);
namespace Tests\Feature\Jobs;
use App\Actions\FetchPageAction;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use App\Jobs\ProcessCrawlJob;
use App\Models\Page;
use App\Models\PageCrawl;
use App\ValueObjects\FetchResult;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Mockery;
use Tests\TestCase;
/**
 * Feature tests for ProcessCrawlJob.
 *
 * Covers two things: that creating a PageCrawl pushes the job onto the queue,
 * and what handle() writes to the page_crawls / pages tables for each
 * CrawlOutcomeEnum value returned by the fetcher.
 *
 * In the handle() tests FetchPageAction is replaced in the container by a
 * Mockery double that returns a canned FetchResult, and the queue is faked so
 * that any job pushed while handling can be asserted on instead of executed.
 */
class ProcessCrawlJobTest extends TestCase
{
    use RefreshDatabase;

    public function test_creating_a_page_crawl_dispatches_process_crawl_job(): void
    {
        Queue::fake();

        // The page is created quietly; the crawl is created with model events enabled.
        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        PageCrawl::factory()->page($page)->create();

        Queue::assertPushed(ProcessCrawlJob::class);
    }

    public function test_dispatched_job_carries_the_correct_page_crawl(): void
    {
        Queue::fake();

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->create();

        Queue::assertPushed(
            ProcessCrawlJob::class,
            fn (ProcessCrawlJob $job): bool => $job->pageCrawl->id === $crawl->id,
        );
    }

    public function test_handle_writes_outcome_to_page_crawl_on_success(): void
    {
        Queue::fake();
        $this->stubFetcherWith(
            $this->successResult('https://example.com/article', 'Hello', 'hi', 1),
        );
        [, $crawl] = $this->pendingCrawl('https://example.com/article');

        $this->handleCrawl($crawl);

        $fresh = $crawl->fresh();
        $this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome);
        $this->assertNotNull($fresh->completed_at);
        $this->assertInstanceOf(Carbon::class, $fresh->completed_at);
        $this->assertSame(200, $fresh->status_code);
        $this->assertNull($fresh->error_message);
    }

    public function test_handle_updates_page_to_fetched_on_success(): void
    {
        Queue::fake();
        $this->stubFetcherWith(
            $this->successResult('https://example.com/article', 'Hello', 'hi', 1),
        );
        [$page, $crawl] = $this->pendingCrawl('https://example.com/article');

        $this->handleCrawl($crawl);

        $fresh = $page->fresh();
        $this->assertSame(PageStatusEnum::Fetched, $fresh->status);
        $this->assertNotNull($fresh->fetched_at);
        $this->assertInstanceOf(Carbon::class, $fresh->fetched_at);
        $this->assertSame('Hello', $fresh->title);
    }

    public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void
    {
        Queue::fake();
        $this->stubFetcherWith($this->unsuccessfulResult(
            CrawlOutcomeEnum::Rejected,
            200,
            'Unsupported Content-Type: application/pdf',
        ));
        [$page, $crawl] = $this->pendingCrawl('https://example.com/brochure.pdf');

        $this->handleCrawl($crawl);

        $fresh = $page->fresh();
        $this->assertSame(PageStatusEnum::Rejected, $fresh->status);
        $this->assertNull($fresh->fetched_at);
    }

    public function test_handle_updates_page_to_failed_on_blocked_4xx(): void
    {
        Queue::fake();
        $this->stubFetcherWith(
            $this->unsuccessfulResult(CrawlOutcomeEnum::Blocked4xx, 404, 'HTTP 404'),
        );
        [$page, $crawl] = $this->pendingCrawl('https://example.com/gone');

        $this->handleCrawl($crawl);

        $fresh = $page->fresh();
        $this->assertSame(PageStatusEnum::Failed, $fresh->status);
        $this->assertNotNull($fresh->failed_at);
        $this->assertInstanceOf(Carbon::class, $fresh->failed_at);
    }

    public function test_handle_updates_page_to_failed_on_timeout(): void
    {
        Queue::fake();
        $this->stubFetcherWith($this->unsuccessfulResult(
            CrawlOutcomeEnum::Timeout,
            null,
            'Connection timed out after 10 seconds',
        ));
        [$page, $crawl] = $this->pendingCrawl('https://example.com/slow');

        $this->handleCrawl($crawl);

        $fresh = $page->fresh();
        $this->assertSame(PageStatusEnum::Failed, $fresh->status);
        $this->assertNotNull($fresh->failed_at);
        $this->assertInstanceOf(Carbon::class, $fresh->failed_at);
    }

    public function test_handle_schedules_retry_on_transient_failure(): void
    {
        Queue::fake();
        $this->stubFetcherWith(
            $this->unsuccessfulResult(CrawlOutcomeEnum::Failed, null, 'Connection refused'),
        );
        [$page, $crawl] = $this->pendingCrawl('https://example.com/unstable');

        $this->handleCrawl($crawl);

        // A second PageCrawl row (the retry) must have been inserted for the same page
        $this->assertSame(2, PageCrawl::where('page_id', $page->id)->count());

        // The new row is pending — outcome IS NULL
        $retryRow = PageCrawl::where('page_id', $page->id)
            ->whereNull('outcome')
            ->first();
        $this->assertNotNull($retryRow);

        // A ProcessCrawlJob must have been pushed for the retry row
        // (only the push is asserted here, not any delay on the job)
        Queue::assertPushed(
            ProcessCrawlJob::class,
            fn (ProcessCrawlJob $job): bool => $job->pageCrawl->page_id === $page->id
                && $job->pageCrawl->id === $retryRow->id,
        );
    }

    public function test_handle_does_not_retry_after_three_attempts(): void
    {
        Queue::fake();
        $this->stubFetcherWith(
            $this->unsuccessfulResult(CrawlOutcomeEnum::Failed, null, 'Connection refused'),
        );
        $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);

        // Two failed attempts already exist; the crawl handled below is the third — the cap
        PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly();
        PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly();
        $thirdCrawl = PageCrawl::factory()->page($page)->createQuietly();

        $this->handleCrawl($thirdCrawl);

        // No 4th row must appear — retry cap reached
        $this->assertSame(3, PageCrawl::where('page_id', $page->id)->count());

        // No retry job dispatched
        Queue::assertNotPushed(ProcessCrawlJob::class);
    }

    public function test_handle_writes_failed_outcome_to_page_crawl(): void
    {
        Queue::fake();
        $this->stubFetcherWith(
            $this->unsuccessfulResult(CrawlOutcomeEnum::Failed, null, 'boom'),
        );
        [, $crawl] = $this->pendingCrawl('https://example.com/unstable');

        $this->handleCrawl($crawl);

        $this->assertDatabaseHas('page_crawls', [
            'id' => $crawl->id,
            'outcome' => CrawlOutcomeEnum::Failed->value,
            'status_code' => null,
            'error_message' => 'boom',
        ]);
    }

    public function test_handle_updates_page_to_failed_on_failed_outcome(): void
    {
        Queue::fake();
        $this->stubFetcherWith(
            $this->unsuccessfulResult(CrawlOutcomeEnum::Failed, null, 'Connection refused'),
        );
        [$page, $crawl] = $this->pendingCrawl('https://example.com/unreachable');

        $this->handleCrawl($crawl);

        $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
    }

    public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
    {
        Queue::fake();
        $this->stubFetcherWith(
            $this->unsuccessfulResult(CrawlOutcomeEnum::Blocked5xx, 503, 'HTTP 503'),
        );
        [$page, $crawl] = $this->pendingCrawl('https://example.com/overloaded');

        $this->handleCrawl($crawl);

        $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
    }

    public function test_handle_updates_page_to_failed_on_blocked_robots(): void
    {
        Queue::fake();
        $this->stubFetcherWith($this->unsuccessfulResult(
            CrawlOutcomeEnum::BlockedRobots,
            null,
            'Disallowed by robots.txt',
        ));
        [$page, $crawl] = $this->pendingCrawl('https://example.com/private');

        $this->handleCrawl($crawl);

        $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
    }

    public function test_handle_does_not_register_outbound_links_on_failure(): void
    {
        Queue::fake();
        // The result carries a link even though the outcome is Failed; it must be ignored.
        $this->stubFetcherWith($this->unsuccessfulResult(
            CrawlOutcomeEnum::Failed,
            null,
            'Connection refused',
            ['https://should-not-be-registered.com/page'],
        ));
        [, $crawl] = $this->pendingCrawl('https://example.com/broken');

        $this->handleCrawl($crawl);

        $this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']);
        $this->assertSame(1, Page::count());
    }

    public function test_handle_registers_outbound_links_on_success(): void
    {
        Queue::fake();
        $this->stubFetcherWith($this->successResult(
            'https://source.com/article',
            'Source Article',
            'some text',
            2,
            [
                'https://other.com/article-1',
                'https://another.com/post-2',
            ],
        ));
        [, $crawl] = $this->pendingCrawl('https://source.com/article');

        $this->handleCrawl($crawl);

        $this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']);
        $this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']);

        // The source page plus the two discovered links
        $this->assertSame(3, Page::count());
    }

    /**
     * Bind a Mockery double for FetchPageAction into the container whose
     * __invoke() returns the given result on every call.
     */
    private function stubFetcherWith(FetchResult $result): void
    {
        $fetcher = Mockery::mock(FetchPageAction::class);
        $fetcher->shouldReceive('__invoke')->andReturn($result);

        $this->app->instance(FetchPageAction::class, $fetcher);
    }

    /**
     * Build a Success FetchResult with HTTP status 200 and no error message.
     *
     * @param list<string> $outboundLinks URLs to expose as the result's outbound links.
     */
    private function successResult(
        string $finalUrl,
        string $title,
        string $extractedText,
        int $wordCount,
        array $outboundLinks = [],
    ): FetchResult {
        return new FetchResult(
            outcome: CrawlOutcomeEnum::Success,
            statusCode: 200,
            finalUrl: $finalUrl,
            title: $title,
            extractedText: $extractedText,
            outboundLinks: collect($outboundLinks),
            wordCount: $wordCount,
            errorMessage: null,
        );
    }

    /**
     * Build a FetchResult for any non-success outcome: final URL, title,
     * extracted text and word count are all null.
     *
     * @param list<string> $outboundLinks URLs to expose as the result's outbound links.
     */
    private function unsuccessfulResult(
        CrawlOutcomeEnum $outcome,
        ?int $statusCode,
        string $errorMessage,
        array $outboundLinks = [],
    ): FetchResult {
        return new FetchResult(
            outcome: $outcome,
            statusCode: $statusCode,
            finalUrl: null,
            title: null,
            extractedText: null,
            outboundLinks: collect($outboundLinks),
            wordCount: null,
            errorMessage: $errorMessage,
        );
    }

    /**
     * Create a page for the given URL and one crawl row for it, both without
     * firing model events.
     *
     * @return array{0: Page, 1: PageCrawl}
     */
    private function pendingCrawl(string $url): array
    {
        $page = Page::factory()->createQuietly(['url' => $url]);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        return [$page, $crawl];
    }

    /**
     * Resolve a ProcessCrawlJob for the crawl from the container and run
     * handle() synchronously with container-resolved collaborators.
     */
    private function handleCrawl(PageCrawl $crawl): void
    {
        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
            ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));
    }
}