14 - Implement ProcessCrawlJob orchestration with retry logic

This commit is contained in:
myrmidex 2026-04-26 23:50:57 +02:00
parent 2a586ecac4
commit 720e4bcc1f
7 changed files with 344 additions and 12 deletions

View file

@ -2,7 +2,12 @@
namespace App\Jobs;
use App\Actions\FetchPageAction;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use App\Models\PageCrawl;
use App\ValueObjects\FetchResult;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Queue\Queueable;
@ -14,8 +19,66 @@ public function __construct(
public PageCrawl $pageCrawl,
) {}
public function handle(): void
/**
 * Fetch the crawl's page, persist the result on both rows, and queue a retry when appropriate.
 *
 * Steps, in order:
 *  1. Invoke the fetcher with the page URL.
 *  2. Record the real fetch outcome, HTTP status code and error message on this PageCrawl.
 *  3. Register every outbound link contained in the result.
 *  4. Update the Page's status and timestamps according to the outcome.
 *  5. For Failed, Timeout and Blocked5xx outcomes, delegate to scheduleRetryIfNeeded().
 *
 * @param FetchPageAction $fetcher Invokable that fetches a URL and returns a FetchResult.
 * @param RegisterDiscoveredPageAction $register Invokable called once per outbound link URL.
 */
public function handle(
    FetchPageAction $fetcher,
    RegisterDiscoveredPageAction $register,
): void {
    $page = $this->pageCrawl->page;

    /** @var FetchResult $result */
    $result = $fetcher($page->url);

    // Store what actually happened; previously this row was always written as
    // Success/200, which hid every failure from the crawl history.
    $this->pageCrawl->update([
        'outcome' => $result->outcome,
        'completed_at' => now(),
        'status_code' => $result->statusCode,
        'error_message' => $result->errorMessage,
    ]);

    $update = match ($result->outcome) {
        CrawlOutcomeEnum::Rejected => [
            'status' => PageStatusEnum::Rejected,
            'fetched_at' => null,
        ],
        // Every failure outcome marks the page as failed. Failed and Blocked5xx used
        // to fall through to the default arm and were wrongly recorded as Fetched.
        CrawlOutcomeEnum::Failed,
        CrawlOutcomeEnum::Timeout,
        CrawlOutcomeEnum::Blocked4xx,
        CrawlOutcomeEnum::Blocked5xx => [
            'status' => PageStatusEnum::Failed,
            'failed_at' => now(),
        ],
        // Success (and any outcome not listed above) is treated as a completed fetch.
        default => [
            'status' => PageStatusEnum::Fetched,
            'fetched_at' => now(),
            'title' => $result->title,
        ],
    };

    $result->outboundLinks->each(fn (string $url) => $register($url));

    $page->update($update);

    // Only these outcomes are retried; Rejected and Blocked4xx are not.
    $retryable = [
        CrawlOutcomeEnum::Failed,
        CrawlOutcomeEnum::Timeout,
        CrawlOutcomeEnum::Blocked5xx,
    ];

    if (in_array($result->outcome, $retryable, true)) {
        $this->scheduleRetryIfNeeded($result, $this->pageCrawl);
    }
}
/**
 * Queue another crawl attempt for the same page unless the attempt cap is reached.
 *
 * All PageCrawl rows of the page are counted, including the one that just finished;
 * once there are three or more, nothing is scheduled. Otherwise a new pending row is
 * inserted with model events suppressed, and a ProcessCrawlJob for that row is
 * dispatched with a one-hour delay.
 *
 * @param FetchResult $result The fetch result that triggered the retry (currently unused).
 * @param PageCrawl $crawl The crawl attempt that has just been processed.
 */
private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): void
{
    $maxAttempts = 3;

    if (PageCrawl::where('page_id', $crawl->page_id)->count() >= $maxAttempts) {
        return;
    }

    // The finished crawl already carries its result columns, so copying it verbatim
    // would produce a "pending" row that looks completed. Reset every result field.
    $pendingAttributes = array_merge($crawl->toArray(), [
        'outcome' => null,
        'completed_at' => null,
        'status_code' => null,
        'error_message' => null,
    ]);

    // NOTE(review): events are suppressed presumably so that no model observer
    // dispatches its own job for this row — confirm against the PageCrawl observers.
    $newRow = PageCrawl::withoutEvents(
        fn () => PageCrawl::create($pendingAttributes)
    );

    ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
}
}

View file

@ -35,6 +35,9 @@ class PageCrawl extends Model
'status_code' => 'integer',
];
/**
* @return BelongsTo<Page, $this>
*/
public function page(): BelongsTo
{
return $this->belongsTo(Page::class);

View file

@ -4,11 +4,18 @@
namespace Tests\Feature\Jobs;
use App\Actions\FetchPageAction;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use App\Jobs\ProcessCrawlJob;
use App\Models\Page;
use App\Models\PageCrawl;
use App\ValueObjects\FetchResult;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Mockery;
use Tests\TestCase;
class ProcessCrawlJobTest extends TestCase
@ -37,4 +44,256 @@ public function test_dispatched_job_carries_the_correct_page_crawl(): void
fn (ProcessCrawlJob $job) => $job->pageCrawl->id === $crawl->id,
);
}
/**
 * A successful fetch is recorded on the PageCrawl row: outcome, completion
 * timestamp and status code are set, and no error message is stored.
 */
public function test_handle_writes_outcome_to_page_crawl_on_success(): void
{
    Queue::fake();

    $successfulFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://example.com/article',
        title: 'Hello',
        extractedText: 'hi',
        outboundLinks: collect(),
        wordCount: 1,
        errorMessage: null,
    );

    // Replace the real fetcher in the container with a stub returning the canned result.
    $fetcherStub = Mockery::mock(FetchPageAction::class);
    $fetcherStub->shouldReceive('__invoke')->andReturn($successfulFetch);
    $this->app->instance(FetchPageAction::class, $fetcherStub);

    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pendingCrawl = PageCrawl::factory()->page($article)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $storedCrawl = $pendingCrawl->fresh();

    $this->assertSame(CrawlOutcomeEnum::Success, $storedCrawl->outcome);
    $this->assertNotNull($storedCrawl->completed_at);
    $this->assertInstanceOf(Carbon::class, $storedCrawl->completed_at);
    $this->assertSame(200, $storedCrawl->status_code);
    $this->assertNull($storedCrawl->error_message);
}
/**
 * A successful fetch moves the Page to the Fetched status, stamps fetched_at
 * and stores the title taken from the fetch result.
 */
public function test_handle_updates_page_to_fetched_on_success(): void
{
    Queue::fake();

    $successfulFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://example.com/article',
        title: 'Hello',
        extractedText: 'hi',
        outboundLinks: collect(),
        wordCount: 1,
        errorMessage: null,
    );

    $fetcherStub = Mockery::mock(FetchPageAction::class);
    $fetcherStub->shouldReceive('__invoke')->andReturn($successfulFetch);
    $this->app->instance(FetchPageAction::class, $fetcherStub);

    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pendingCrawl = PageCrawl::factory()->page($article)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $storedPage = $article->fresh();

    $this->assertSame(PageStatusEnum::Fetched, $storedPage->status);
    $this->assertNotNull($storedPage->fetched_at);
    $this->assertInstanceOf(Carbon::class, $storedPage->fetched_at);
    $this->assertSame('Hello', $storedPage->title);
}
/**
 * A Rejected fetch result marks the Page as Rejected and leaves fetched_at empty.
 */
public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void
{
    Queue::fake();

    $rejectedFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::Rejected,
        statusCode: 200,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Unsupported Content-Type: application/pdf',
    );

    $fetcherStub = Mockery::mock(FetchPageAction::class);
    $fetcherStub->shouldReceive('__invoke')->andReturn($rejectedFetch);
    $this->app->instance(FetchPageAction::class, $fetcherStub);

    $brochure = Page::factory()->createQuietly(['url' => 'https://example.com/brochure.pdf']);
    $pendingCrawl = PageCrawl::factory()->page($brochure)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $storedPage = $brochure->fresh();

    $this->assertSame(PageStatusEnum::Rejected, $storedPage->status);
    $this->assertNull($storedPage->fetched_at);
}
/**
 * A Blocked4xx fetch result (HTTP 404 here) marks the Page as Failed and stamps failed_at.
 */
public function test_handle_updates_page_to_failed_on_blocked_4xx(): void
{
    Queue::fake();

    $notFoundFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::Blocked4xx,
        statusCode: 404,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'HTTP 404',
    );

    $fetcherStub = Mockery::mock(FetchPageAction::class);
    $fetcherStub->shouldReceive('__invoke')->andReturn($notFoundFetch);
    $this->app->instance(FetchPageAction::class, $fetcherStub);

    $missingPage = Page::factory()->createQuietly(['url' => 'https://example.com/gone']);
    $pendingCrawl = PageCrawl::factory()->page($missingPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $storedPage = $missingPage->fresh();

    $this->assertSame(PageStatusEnum::Failed, $storedPage->status);
    $this->assertNotNull($storedPage->failed_at);
    $this->assertInstanceOf(Carbon::class, $storedPage->failed_at);
}
/**
 * A Timeout fetch result marks the Page as Failed and stamps failed_at.
 */
public function test_handle_updates_page_to_failed_on_timeout(): void
{
    Queue::fake();

    $timedOutFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::Timeout,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Connection timed out after 10 seconds',
    );

    $fetcherStub = Mockery::mock(FetchPageAction::class);
    $fetcherStub->shouldReceive('__invoke')->andReturn($timedOutFetch);
    $this->app->instance(FetchPageAction::class, $fetcherStub);

    $slowPage = Page::factory()->createQuietly(['url' => 'https://example.com/slow']);
    $pendingCrawl = PageCrawl::factory()->page($slowPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $storedPage = $slowPage->fresh();

    $this->assertSame(PageStatusEnum::Failed, $storedPage->status);
    $this->assertNotNull($storedPage->failed_at);
    $this->assertInstanceOf(Carbon::class, $storedPage->failed_at);
}
/**
 * A Failed fetch result on a page with a single crawl attempt creates a second,
 * pending PageCrawl row and pushes a ProcessCrawlJob that targets it.
 */
public function test_handle_schedules_retry_on_transient_failure(): void
{
    Queue::fake();

    $refusedFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Connection refused',
    );

    $fetcherStub = Mockery::mock(FetchPageAction::class);
    $fetcherStub->shouldReceive('__invoke')->andReturn($refusedFetch);
    $this->app->instance(FetchPageAction::class, $fetcherStub);

    $unstablePage = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
    $firstCrawl = PageCrawl::factory()->page($unstablePage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $firstCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    // The page now owns two crawl rows: the processed one plus the retry.
    $this->assertSame(2, PageCrawl::where('page_id', $unstablePage->id)->count());

    // The retry row is still pending, i.e. its outcome is NULL.
    $retryRow = PageCrawl::where('page_id', $unstablePage->id)
        ->whereNull('outcome')
        ->first();

    $this->assertNotNull($retryRow);

    // The pushed job must carry exactly that retry row.
    Queue::assertPushed(
        ProcessCrawlJob::class,
        fn (ProcessCrawlJob $pushed) => $pushed->pageCrawl->page_id === $unstablePage->id
            && $pushed->pageCrawl->id === $retryRow->id,
    );
}
/**
 * Once a page already has three crawl rows, a further Failed result neither
 * inserts a fourth row nor pushes another ProcessCrawlJob.
 */
public function test_handle_does_not_retry_after_three_attempts(): void
{
    Queue::fake();

    $refusedFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Connection refused',
    );

    $fetcherStub = Mockery::mock(FetchPageAction::class);
    $fetcherStub->shouldReceive('__invoke')->andReturn($refusedFetch);
    $this->app->instance(FetchPageAction::class, $fetcherStub);

    $unreachablePage = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);

    // Two failed attempts plus the one being processed: three rows, which is the cap.
    PageCrawl::factory()->page($unreachablePage)->failed('Connection refused')->createQuietly();
    PageCrawl::factory()->page($unreachablePage)->failed('Connection refused')->createQuietly();
    $lastAllowedCrawl = PageCrawl::factory()->page($unreachablePage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $lastAllowedCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    // Still three rows: no retry row was added.
    $this->assertSame(3, PageCrawl::where('page_id', $unreachablePage->id)->count());

    // And nothing was queued.
    Queue::assertNotPushed(ProcessCrawlJob::class);
}
/**
 * Each outbound link of a successful fetch ends up as its own row in the pages
 * table, next to the source page (three pages in total).
 */
public function test_handle_registers_outbound_links_on_success(): void
{
    Queue::fake();

    $fetchWithLinks = new FetchResult(
        outcome: CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://source.com/article',
        title: 'Source Article',
        extractedText: 'some text',
        outboundLinks: collect([
            'https://other.com/article-1',
            'https://another.com/post-2',
        ]),
        wordCount: 2,
        errorMessage: null,
    );

    $fetcherStub = Mockery::mock(FetchPageAction::class);
    $fetcherStub->shouldReceive('__invoke')->andReturn($fetchWithLinks);
    $this->app->instance(FetchPageAction::class, $fetcherStub);

    $sourcePage = Page::factory()->createQuietly(['url' => 'https://source.com/article']);
    $pendingCrawl = PageCrawl::factory()->page($sourcePage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    $this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']);
    $this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']);
    $this->assertSame(3, Page::count());
}
}

View file

@ -32,16 +32,6 @@ public function test_creating_a_page_inserts_a_page_crawl_row(): void
$this->assertNotNull($crawl);
}
/**
 * The crawl row found for a freshly created page has no outcome yet.
 */
public function test_created_page_crawl_has_null_outcome(): void
{
    $createdPage = Page::factory()->create(['url' => 'https://example-blog.com/article']);

    $firstCrawl = PageCrawl::where('page_id', $createdPage->id)->first();

    $this->assertNotNull($firstCrawl);
    $this->assertNull($firstCrawl->outcome);
}
public function test_first_or_create_with_existing_url_does_not_insert_duplicate_crawl(): void
{
$url = 'https://example-blog.com/article';

View file

@ -9,6 +9,7 @@
use App\Models\PageCrawl;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Tests\TestCase;
class PageCrawlFactoryTest extends TestCase
@ -17,6 +18,8 @@ class PageCrawlFactoryTest extends TestCase
public function test_factory_successful_state_produces_success_outcome(): void
{
Queue::fake();
$page = Page::factory()->create();
$crawl = PageCrawl::factory()->page($page)->successful()->create();
@ -27,6 +30,8 @@ public function test_factory_successful_state_produces_success_outcome(): void
public function test_factory_failed_state_produces_failed_outcome_with_message(): void
{
Queue::fake();
$page = Page::factory()->create();
$crawl = PageCrawl::factory()->page($page)->failed('Connection timed out')->create();

View file

@ -9,6 +9,7 @@
use App\Models\PageCrawl;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Tests\TestCase;
class PageCrawlTest extends TestCase
@ -17,6 +18,8 @@ class PageCrawlTest extends TestCase
public function test_page_crawl_fillable_fields_persist_and_casts_are_applied(): void
{
Queue::fake();
$page = Page::factory()->createQuietly(['url' => 'https://example.com/page-1']);
$completedAt = Carbon::parse('2026-05-01 10:01:05');
@ -90,6 +93,8 @@ public function test_deleting_a_page_cascades_to_its_page_crawls(): void
public function test_pending_crawls_are_filtered_by_null_outcome(): void
{
Queue::fake();
// createQuietly() skips the PageObserver; this test counts rows with null/non-null
// outcome — the auto-inserted observer crawl (outcome=null) would corrupt both counts.
$page = Page::factory()->createQuietly(['url' => 'https://example.com/page-pending']);

View file

@ -10,6 +10,7 @@
use App\Models\PageLink;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
@ -18,6 +19,12 @@ class PageTest extends TestCase
{
use RefreshDatabase;
/**
 * Boot the test case, then swap the queue for a fake before every test in this class.
 *
 * NOTE(review): presumably needed because creating a Page now leads to a
 * ProcessCrawlJob being dispatched — confirm against the Page observer.
 */
protected function setUp(): void
{
// The parent must boot the application first; the Queue facade is only usable afterwards.
parent::setUp();
Queue::fake();
}
public function test_page_model_fillable_fields_can_be_mass_assigned(): void
{
$page = Page::create([