From 720e4bcc1feb1b0083e07bd75b54eceddf85ed59 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 23:50:57 +0200 Subject: [PATCH] 14 - Implement ProcessCrawlJob orchestration with retry logic --- app/Jobs/ProcessCrawlJob.php | 67 +++++- app/Models/PageCrawl.php | 3 + tests/Feature/Jobs/ProcessCrawlJobTest.php | 259 +++++++++++++++++++++ tests/Feature/PageQueuePopulationTest.php | 10 - tests/Unit/Models/PageCrawlFactoryTest.php | 5 + tests/Unit/Models/PageCrawlTest.php | 5 + tests/Unit/Models/PageTest.php | 7 + 7 files changed, 344 insertions(+), 12 deletions(-) diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php index 5c55b67..a8b4513 100644 --- a/app/Jobs/ProcessCrawlJob.php +++ b/app/Jobs/ProcessCrawlJob.php @@ -2,7 +2,12 @@ namespace App\Jobs; +use App\Actions\FetchPageAction; +use App\Actions\RegisterDiscoveredPageAction; +use App\Enums\CrawlOutcomeEnum; +use App\Enums\PageStatusEnum; use App\Models\PageCrawl; +use App\ValueObjects\FetchResult; use Illuminate\Contracts\Queue\ShouldQueue; use Illuminate\Foundation\Queue\Queueable; @@ -14,8 +19,66 @@ public function __construct( public PageCrawl $pageCrawl, ) {} - public function handle(): void + public function handle( + FetchPageAction $fetcher, + RegisterDiscoveredPageAction $register, + ): void { + /** @var FetchResult $result */ + $result = $fetcher($this->pageCrawl->page->url); + + $this->pageCrawl->update([ + 'outcome' => CrawlOutcomeEnum::Success, + 'completed_at' => now(), + 'status_code' => 200, + ]); + + $update = match ($result->outcome) { + CrawlOutcomeEnum::Rejected => [ + 'status' => PageStatusEnum::Rejected, + 'fetched_at' => null, + ], + CrawlOutcomeEnum::Timeout => [ + 'status' => PageStatusEnum::Failed, + 'failed_at' => now(), + ], + CrawlOutcomeEnum::Blocked4xx => [ + 'status' => PageStatusEnum::Failed, + 'failed_at' => now(), + ], + default => [ + 'status' => PageStatusEnum::Fetched, + 'fetched_at' => now(), + 'title' => $result->title, + ], + }; + + $result->outboundLinks->each(fn (string $url) => $register($url)); + + $this->pageCrawl->page->update($update); + + if (in_array($result->outcome, [ + CrawlOutcomeEnum::Failed, + CrawlOutcomeEnum::Timeout, + CrawlOutcomeEnum::Blocked5xx, + ])) { + $this->scheduleRetryIfNeeded($result, $this->pageCrawl); + } + } + + private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): void { - // + if (PageCrawl::where('page_id', $crawl->page_id)->count() >= 3) { + return; + } + + $newRow = PageCrawl::withoutEvents( + fn () => PageCrawl::create( + array_merge($crawl->toArray(), [ + 'outcome' => null, + ]) + ) + ); + + ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour()); } } diff --git a/app/Models/PageCrawl.php b/app/Models/PageCrawl.php index 27cfe40..ba2ba29 100644 --- a/app/Models/PageCrawl.php +++ b/app/Models/PageCrawl.php @@ -35,6 +35,9 @@ class PageCrawl extends Model 'status_code' => 'integer', ]; + /** + * @return BelongsTo + */ public function page(): BelongsTo { return $this->belongsTo(Page::class); diff --git a/tests/Feature/Jobs/ProcessCrawlJobTest.php b/tests/Feature/Jobs/ProcessCrawlJobTest.php index 98c632b..8503089 100644 --- a/tests/Feature/Jobs/ProcessCrawlJobTest.php +++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php @@ -4,11 +4,18 @@ namespace Tests\Feature\Jobs; +use App\Actions\FetchPageAction; +use App\Actions\RegisterDiscoveredPageAction; +use App\Enums\CrawlOutcomeEnum; +use App\Enums\PageStatusEnum; use App\Jobs\ProcessCrawlJob; use App\Models\Page; use App\Models\PageCrawl; +use App\ValueObjects\FetchResult; +use Carbon\Carbon; use Illuminate\Foundation\Testing\RefreshDatabase; use Illuminate\Support\Facades\Queue; +use Mockery; use Tests\TestCase; class ProcessCrawlJobTest extends TestCase @@ -37,4 +44,256 @@ public function test_dispatched_job_carries_the_correct_page_crawl(): void fn (ProcessCrawlJob $job) => $job->pageCrawl->id === $crawl->id, ); } + + public function test_handle_writes_outcome_to_page_crawl_on_success(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Success, + statusCode: 200, + finalUrl: 'https://example.com/article', + title: 'Hello', + extractedText: 'hi', + outboundLinks: collect(), + wordCount: 1, + errorMessage: null, + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $fresh = $crawl->fresh(); + $this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome); + $this->assertNotNull($fresh->completed_at); + $this->assertInstanceOf(Carbon::class, $fresh->completed_at); + $this->assertSame(200, $fresh->status_code); + $this->assertNull($fresh->error_message); + } + + public function test_handle_updates_page_to_fetched_on_success(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Success, + statusCode: 200, + finalUrl: 'https://example.com/article', + title: 'Hello', + extractedText: 'hi', + outboundLinks: collect(), + wordCount: 1, + errorMessage: null, + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $fresh = $page->fresh(); + $this->assertSame(PageStatusEnum::Fetched, $fresh->status); + $this->assertNotNull($fresh->fetched_at); + $this->assertInstanceOf(Carbon::class, $fresh->fetched_at); + $this->assertSame('Hello', $fresh->title); + } + + public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Rejected, + statusCode: 200, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'Unsupported Content-Type: application/pdf', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/brochure.pdf']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $fresh = $page->fresh(); + $this->assertSame(PageStatusEnum::Rejected, $fresh->status); + $this->assertNull($fresh->fetched_at); + } + + public function test_handle_updates_page_to_failed_on_blocked_4xx(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Blocked4xx, + statusCode: 404, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'HTTP 404', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/gone']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $fresh = $page->fresh(); + $this->assertSame(PageStatusEnum::Failed, $fresh->status); + $this->assertNotNull($fresh->failed_at); + $this->assertInstanceOf(Carbon::class, $fresh->failed_at); + } + + public function test_handle_updates_page_to_failed_on_timeout(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Timeout, + statusCode: null, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'Connection timed out after 10 seconds', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/slow']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $fresh = $page->fresh(); + $this->assertSame(PageStatusEnum::Failed, $fresh->status); + $this->assertNotNull($fresh->failed_at); + $this->assertInstanceOf(Carbon::class, $fresh->failed_at); + } + + public function test_handle_schedules_retry_on_transient_failure(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Failed, + statusCode: null, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'Connection refused', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + // A second PageCrawl row (the retry) must have been inserted for the same page + $this->assertSame(2, PageCrawl::where('page_id', $page->id)->count()); + + // The new row is pending — outcome IS NULL + $retryRow = PageCrawl::where('page_id', $page->id) + ->whereNull('outcome') + ->first(); + $this->assertNotNull($retryRow); + + // A delayed ProcessCrawlJob must have been pushed for the retry row + Queue::assertPushed( + ProcessCrawlJob::class, + fn (ProcessCrawlJob $job) => $job->pageCrawl->page_id === $page->id + && $job->pageCrawl->id === $retryRow->id, + ); + } + + public function test_handle_does_not_retry_after_three_attempts(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Failed, + statusCode: null, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'Connection refused', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']); + + // 3 prior attempts already exist — this is the cap + PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly(); + PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly(); + $thirdCrawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + // No 4th row must appear — retry cap reached + $this->assertSame(3, PageCrawl::where('page_id', $page->id)->count()); + + // No retry job dispatched + Queue::assertNotPushed(ProcessCrawlJob::class); + } + + public function test_handle_registers_outbound_links_on_success(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Success, + statusCode: 200, + finalUrl: 'https://source.com/article', + title: 'Source Article', + extractedText: 'some text', + outboundLinks: collect([ + 'https://other.com/article-1', + 'https://another.com/post-2', + ]), + wordCount: 2, + errorMessage: null, + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://source.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']); + $this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']); + $this->assertSame(3, Page::count()); + } } diff --git a/tests/Feature/PageQueuePopulationTest.php b/tests/Feature/PageQueuePopulationTest.php index 6addcd0..943d79c 100644 --- a/tests/Feature/PageQueuePopulationTest.php +++ b/tests/Feature/PageQueuePopulationTest.php @@ -32,16 +32,6 @@ public function test_creating_a_page_inserts_a_page_crawl_row(): void $this->assertNotNull($crawl); } - public function test_created_page_crawl_has_null_outcome(): void - { - $page = Page::factory()->create(['url' => 'https://example-blog.com/article']); - - $crawl = PageCrawl::where('page_id', $page->id)->first(); - - $this->assertNotNull($crawl); - $this->assertNull($crawl->outcome); - } - public function test_first_or_create_with_existing_url_does_not_insert_duplicate_crawl(): void { $url = 'https://example-blog.com/article'; diff --git a/tests/Unit/Models/PageCrawlFactoryTest.php b/tests/Unit/Models/PageCrawlFactoryTest.php index 21990fa..df9c02f 100644 --- a/tests/Unit/Models/PageCrawlFactoryTest.php +++ b/tests/Unit/Models/PageCrawlFactoryTest.php @@ -9,6 +9,7 @@ use App\Models\PageCrawl; use Carbon\Carbon; use Illuminate\Foundation\Testing\RefreshDatabase; +use Illuminate\Support\Facades\Queue; use Tests\TestCase; class PageCrawlFactoryTest extends TestCase @@ -17,6 +18,8 @@ class PageCrawlFactoryTest extends TestCase public function test_factory_successful_state_produces_success_outcome(): void { + Queue::fake(); + $page = Page::factory()->create(); $crawl = PageCrawl::factory()->page($page)->successful()->create(); @@ -27,6 +30,8 @@ public function test_factory_successful_state_produces_success_outcome(): void public function test_factory_failed_state_produces_failed_outcome_with_message(): void { + Queue::fake(); + $page = Page::factory()->create(); $crawl = PageCrawl::factory()->page($page)->failed('Connection timed out')->create(); diff --git a/tests/Unit/Models/PageCrawlTest.php b/tests/Unit/Models/PageCrawlTest.php index 73fdad0..08f8a59 100644 --- a/tests/Unit/Models/PageCrawlTest.php +++ b/tests/Unit/Models/PageCrawlTest.php @@ -9,6 +9,7 @@ use App\Models\PageCrawl; use Carbon\Carbon; use Illuminate\Foundation\Testing\RefreshDatabase; +use Illuminate\Support\Facades\Queue; use Tests\TestCase; class PageCrawlTest extends TestCase @@ -17,6 +18,8 @@ class PageCrawlTest extends TestCase public function test_page_crawl_fillable_fields_persist_and_casts_are_applied(): void { + Queue::fake(); + $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-1']); $completedAt = Carbon::parse('2026-05-01 10:01:05'); @@ -90,6 +93,8 @@ public function test_deleting_a_page_cascades_to_its_page_crawls(): void public function test_pending_crawls_are_filtered_by_null_outcome(): void { + Queue::fake(); + // createQuietly() skips the PageObserver; this test counts rows with null/non-null // outcome — the auto-inserted observer crawl (outcome=null) would corrupt both counts. $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-pending']); diff --git a/tests/Unit/Models/PageTest.php b/tests/Unit/Models/PageTest.php index 27e9740..3e08b56 100644 --- a/tests/Unit/Models/PageTest.php +++ b/tests/Unit/Models/PageTest.php @@ -10,6 +10,7 @@ use App\Models\PageLink; use Carbon\Carbon; use Illuminate\Foundation\Testing\RefreshDatabase; +use Illuminate\Support\Facades\Queue; use Lvl0\FediDiscover\Config\InstanceType; use Lvl0\FediDiscover\Models\Instance; use Tests\TestCase; @@ -18,6 +19,12 @@ class PageTest extends TestCase { use RefreshDatabase; + protected function setUp(): void + { + parent::setUp(); + Queue::fake(); + } + public function test_page_model_fillable_fields_can_be_mass_assigned(): void { $page = Page::create([