14 - Implement ProcessCrawlJob orchestration with retry logic
This commit is contained in:
parent
2a586ecac4
commit
720e4bcc1f
7 changed files with 344 additions and 12 deletions
|
|
@ -2,7 +2,12 @@
|
|||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Actions\FetchPageAction;
|
||||
use App\Actions\RegisterDiscoveredPageAction;
|
||||
use App\Enums\CrawlOutcomeEnum;
|
||||
use App\Enums\PageStatusEnum;
|
||||
use App\Models\PageCrawl;
|
||||
use App\ValueObjects\FetchResult;
|
||||
use Illuminate\Contracts\Queue\ShouldQueue;
|
||||
use Illuminate\Foundation\Queue\Queueable;
|
||||
|
||||
|
|
@ -14,8 +19,66 @@ public function __construct(
|
|||
public PageCrawl $pageCrawl,
|
||||
) {}
|
||||
|
||||
public function handle(): void
|
||||
/**
 * Run one crawl attempt for this job's PageCrawl.
 *
 * Fetches the page URL, records the attempt's result on the PageCrawl row,
 * updates the parent Page's status, registers every outbound link the fetch
 * returned, and schedules a retry when the failure is a transient one.
 *
 * @param FetchPageAction              $fetcher  Invoked with the page URL; returns a FetchResult.
 * @param RegisterDiscoveredPageAction $register Invoked once per outbound link URL.
 */
public function handle(
    FetchPageAction $fetcher,
    RegisterDiscoveredPageAction $register,
): void {
    $page = $this->pageCrawl->page;

    /** @var FetchResult $result */
    $result = $fetcher($page->url);

    // Persist what the fetch actually reported. Hard-coding Success/200 here
    // would mark timeouts, 4xx/5xx responses and rejections as successful attempts.
    $this->pageCrawl->update([
        'outcome' => $result->outcome,
        'completed_at' => now(),
        'status_code' => $result->statusCode,
        'error_message' => $result->errorMessage,
    ]);

    $update = match ($result->outcome) {
        CrawlOutcomeEnum::Rejected => [
            'status' => PageStatusEnum::Rejected,
            'fetched_at' => null,
        ],
        // Every failure outcome marks the page Failed. Failed and Blocked5xx
        // must be listed explicitly, otherwise they reach the default arm and
        // the page is recorded as Fetched.
        CrawlOutcomeEnum::Failed,
        CrawlOutcomeEnum::Timeout,
        CrawlOutcomeEnum::Blocked4xx,
        CrawlOutcomeEnum::Blocked5xx => [
            'status' => PageStatusEnum::Failed,
            'failed_at' => now(),
        ],
        default => [
            'status' => PageStatusEnum::Fetched,
            'fetched_at' => now(),
            'title' => $result->title,
        ],
    };

    $result->outboundLinks->each(fn (string $url) => $register($url));

    $page->update($update);

    // Only these outcomes are retried; Rejected and Blocked4xx are not.
    $retryable = [
        CrawlOutcomeEnum::Failed,
        CrawlOutcomeEnum::Timeout,
        CrawlOutcomeEnum::Blocked5xx,
    ];

    if (in_array($result->outcome, $retryable, true)) {
        $this->scheduleRetryIfNeeded($result, $this->pageCrawl);
    }
}
|
||||
|
||||
/** Maximum number of PageCrawl rows (attempts) per page before retries stop. */
private const MAX_CRAWL_ATTEMPTS = 3;

/**
 * Insert a fresh pending PageCrawl row for the same page and dispatch a
 * ProcessCrawlJob for it one hour from now, unless the page already has
 * MAX_CRAWL_ATTEMPTS crawl rows.
 *
 * @param FetchResult $result Result of the attempt that just failed (not read here).
 * @param PageCrawl   $crawl  The attempt that just failed; its attributes seed the retry row.
 */
private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): void
{
    $attempts = PageCrawl::where('page_id', $crawl->page_id)->count();

    if ($attempts >= self::MAX_CRAWL_ATTEMPTS) {
        return;
    }

    // attributesToArray() copies only the row's own attributes; toArray() would
    // also include the already-loaded `page` relation. The result columns are
    // reset so the new row is a genuinely pending attempt rather than a copy of
    // the finished one.
    $retryAttributes = array_merge($crawl->attributesToArray(), [
        'outcome' => null,
        'completed_at' => null,
        'status_code' => null,
        'error_message' => null,
    ]);

    // Created without model events so inserting the retry row does not fire
    // PageCrawl's event listeners; the job is dispatched explicitly below.
    $retryCrawl = PageCrawl::withoutEvents(
        static fn () => PageCrawl::create($retryAttributes)
    );

    self::dispatch($retryCrawl)->delay(now()->addHour());
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -35,6 +35,9 @@ class PageCrawl extends Model
|
|||
'status_code' => 'integer',
|
||||
];
|
||||
|
||||
/**
|
||||
* @return BelongsTo<Page, $this>
|
||||
*/
|
||||
public function page(): BelongsTo
|
||||
{
|
||||
return $this->belongsTo(Page::class);
|
||||
|
|
|
|||
|
|
@ -4,11 +4,18 @@
|
|||
|
||||
namespace Tests\Feature\Jobs;
|
||||
|
||||
use App\Actions\FetchPageAction;
|
||||
use App\Actions\RegisterDiscoveredPageAction;
|
||||
use App\Enums\CrawlOutcomeEnum;
|
||||
use App\Enums\PageStatusEnum;
|
||||
use App\Jobs\ProcessCrawlJob;
|
||||
use App\Models\Page;
|
||||
use App\Models\PageCrawl;
|
||||
use App\ValueObjects\FetchResult;
|
||||
use Carbon\Carbon;
|
||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
||||
use Illuminate\Support\Facades\Queue;
|
||||
use Mockery;
|
||||
use Tests\TestCase;
|
||||
|
||||
class ProcessCrawlJobTest extends TestCase
|
||||
|
|
@ -37,4 +44,256 @@ public function test_dispatched_job_carries_the_correct_page_crawl(): void
|
|||
fn (ProcessCrawlJob $job) => $job->pageCrawl->id === $crawl->id,
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * A successful fetch is recorded on the PageCrawl row: outcome, completion
 * timestamp, status code, and no error message.
 */
public function test_handle_writes_outcome_to_page_crawl_on_success(): void
{
    Queue::fake();

    $fetchResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://example.com/article',
        title: 'Hello',
        extractedText: 'hi',
        outboundLinks: collect(),
        wordCount: 1,
        errorMessage: null,
    );

    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($fetchResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = $this->app->make(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(
        $this->app->make(FetchPageAction::class),
        $this->app->make(RegisterDiscoveredPageAction::class),
    );

    $reloaded = $pageCrawl->fresh();

    self::assertSame(CrawlOutcomeEnum::Success, $reloaded->outcome);
    self::assertNotNull($reloaded->completed_at);
    self::assertInstanceOf(Carbon::class, $reloaded->completed_at);
    self::assertSame(200, $reloaded->status_code);
    self::assertNull($reloaded->error_message);
}
|
||||
|
||||
/**
 * A successful fetch moves the Page to Fetched, stamps fetched_at and stores
 * the title from the fetch result.
 */
public function test_handle_updates_page_to_fetched_on_success(): void
{
    Queue::fake();

    $fetchResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://example.com/article',
        title: 'Hello',
        extractedText: 'hi',
        outboundLinks: collect(),
        wordCount: 1,
        errorMessage: null,
    );

    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($fetchResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = $this->app->make(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(
        $this->app->make(FetchPageAction::class),
        $this->app->make(RegisterDiscoveredPageAction::class),
    );

    $reloaded = $page->fresh();

    self::assertSame(PageStatusEnum::Fetched, $reloaded->status);
    self::assertNotNull($reloaded->fetched_at);
    self::assertInstanceOf(Carbon::class, $reloaded->fetched_at);
    self::assertSame('Hello', $reloaded->title);
}
|
||||
|
||||
/**
 * A Rejected fetch outcome moves the Page to Rejected and leaves fetched_at null.
 */
public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void
{
    Queue::fake();

    $fetchResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Rejected,
        statusCode: 200,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Unsupported Content-Type: application/pdf',
    );

    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($fetchResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/brochure.pdf']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = $this->app->make(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(
        $this->app->make(FetchPageAction::class),
        $this->app->make(RegisterDiscoveredPageAction::class),
    );

    $reloaded = $page->fresh();

    self::assertSame(PageStatusEnum::Rejected, $reloaded->status);
    self::assertNull($reloaded->fetched_at);
}
|
||||
|
||||
/**
 * A Blocked4xx fetch outcome moves the Page to Failed and stamps failed_at.
 */
public function test_handle_updates_page_to_failed_on_blocked_4xx(): void
{
    Queue::fake();

    $fetchResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Blocked4xx,
        statusCode: 404,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'HTTP 404',
    );

    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($fetchResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/gone']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = $this->app->make(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(
        $this->app->make(FetchPageAction::class),
        $this->app->make(RegisterDiscoveredPageAction::class),
    );

    $reloaded = $page->fresh();

    self::assertSame(PageStatusEnum::Failed, $reloaded->status);
    self::assertNotNull($reloaded->failed_at);
    self::assertInstanceOf(Carbon::class, $reloaded->failed_at);
}
|
||||
|
||||
/**
 * A Timeout fetch outcome moves the Page to Failed and stamps failed_at.
 */
public function test_handle_updates_page_to_failed_on_timeout(): void
{
    Queue::fake();

    $fetchResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Timeout,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Connection timed out after 10 seconds',
    );

    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($fetchResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/slow']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = $this->app->make(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(
        $this->app->make(FetchPageAction::class),
        $this->app->make(RegisterDiscoveredPageAction::class),
    );

    $reloaded = $page->fresh();

    self::assertSame(PageStatusEnum::Failed, $reloaded->status);
    self::assertNotNull($reloaded->failed_at);
    self::assertInstanceOf(Carbon::class, $reloaded->failed_at);
}
|
||||
|
||||
/**
 * A transient failure (outcome Failed) inserts a second, pending PageCrawl row
 * for the same page and pushes a delayed ProcessCrawlJob carrying that row.
 */
public function test_handle_schedules_retry_on_transient_failure(): void
{
    Queue::fake();

    $fetcher = Mockery::mock(FetchPageAction::class);
    $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Connection refused',
    ));
    $this->app->instance(FetchPageAction::class, $fetcher);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
        ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class));

    // A second PageCrawl row (the retry) must have been inserted for the same page
    $this->assertSame(2, PageCrawl::where('page_id', $page->id)->count());

    // The new row is pending — outcome IS NULL
    $retryRow = PageCrawl::where('page_id', $page->id)
        ->whereNull('outcome')
        ->first();
    $this->assertNotNull($retryRow);

    // A delayed ProcessCrawlJob must have been pushed for the retry row.
    // The delay check is what distinguishes a scheduled retry from an
    // immediate re-dispatch.
    Queue::assertPushed(
        ProcessCrawlJob::class,
        fn (ProcessCrawlJob $job) => $job->pageCrawl->page_id === $page->id
            && $job->pageCrawl->id === $retryRow->id
            && $job->delay !== null,
    );
}
|
||||
|
||||
/**
 * When the page already has three PageCrawl rows, a further failure neither
 * inserts a fourth row nor pushes another ProcessCrawlJob.
 */
public function test_handle_does_not_retry_after_three_attempts(): void
{
    Queue::fake();

    $fetchResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Connection refused',
    );

    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($fetchResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);

    // Two failed attempts plus the one about to run: three rows, which is the cap.
    PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly();
    PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly();
    $finalAttempt = PageCrawl::factory()->page($page)->createQuietly();

    $job = $this->app->make(ProcessCrawlJob::class, ['pageCrawl' => $finalAttempt]);
    $job->handle(
        $this->app->make(FetchPageAction::class),
        $this->app->make(RegisterDiscoveredPageAction::class),
    );

    // Still exactly three rows: no retry row was inserted.
    $rowCount = PageCrawl::where('page_id', $page->id)->count();
    self::assertSame(3, $rowCount);

    // And nothing was queued.
    Queue::assertNotPushed(ProcessCrawlJob::class);
}
|
||||
|
||||
/**
 * Each outbound link in a successful fetch result ends up as a row in the
 * pages table, alongside the source page.
 */
public function test_handle_registers_outbound_links_on_success(): void
{
    Queue::fake();

    $discoveredLinks = collect([
        'https://other.com/article-1',
        'https://another.com/post-2',
    ]);

    $fetchResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://source.com/article',
        title: 'Source Article',
        extractedText: 'some text',
        outboundLinks: $discoveredLinks,
        wordCount: 2,
        errorMessage: null,
    );

    $fetchPage = Mockery::mock(FetchPageAction::class);
    $fetchPage->shouldReceive('__invoke')->andReturn($fetchResult);
    $this->app->instance(FetchPageAction::class, $fetchPage);

    $page = Page::factory()->createQuietly(['url' => 'https://source.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = $this->app->make(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle(
        $this->app->make(FetchPageAction::class),
        $this->app->make(RegisterDiscoveredPageAction::class),
    );

    $this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']);
    $this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']);

    // Source page + two discovered pages.
    self::assertSame(3, Page::count());
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,16 +32,6 @@ public function test_creating_a_page_inserts_a_page_crawl_row(): void
|
|||
$this->assertNotNull($crawl);
|
||||
}
|
||||
|
||||
public function test_created_page_crawl_has_null_outcome(): void
|
||||
{
|
||||
$page = Page::factory()->create(['url' => 'https://example-blog.com/article']);
|
||||
|
||||
$crawl = PageCrawl::where('page_id', $page->id)->first();
|
||||
|
||||
$this->assertNotNull($crawl);
|
||||
$this->assertNull($crawl->outcome);
|
||||
}
|
||||
|
||||
public function test_first_or_create_with_existing_url_does_not_insert_duplicate_crawl(): void
|
||||
{
|
||||
$url = 'https://example-blog.com/article';
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@
|
|||
use App\Models\PageCrawl;
|
||||
use Carbon\Carbon;
|
||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
||||
use Illuminate\Support\Facades\Queue;
|
||||
use Tests\TestCase;
|
||||
|
||||
class PageCrawlFactoryTest extends TestCase
|
||||
|
|
@ -17,6 +18,8 @@ class PageCrawlFactoryTest extends TestCase
|
|||
|
||||
public function test_factory_successful_state_produces_success_outcome(): void
|
||||
{
|
||||
Queue::fake();
|
||||
|
||||
$page = Page::factory()->create();
|
||||
$crawl = PageCrawl::factory()->page($page)->successful()->create();
|
||||
|
||||
|
|
@ -27,6 +30,8 @@ public function test_factory_successful_state_produces_success_outcome(): void
|
|||
|
||||
public function test_factory_failed_state_produces_failed_outcome_with_message(): void
|
||||
{
|
||||
Queue::fake();
|
||||
|
||||
$page = Page::factory()->create();
|
||||
$crawl = PageCrawl::factory()->page($page)->failed('Connection timed out')->create();
|
||||
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@
|
|||
use App\Models\PageCrawl;
|
||||
use Carbon\Carbon;
|
||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
||||
use Illuminate\Support\Facades\Queue;
|
||||
use Tests\TestCase;
|
||||
|
||||
class PageCrawlTest extends TestCase
|
||||
|
|
@ -17,6 +18,8 @@ class PageCrawlTest extends TestCase
|
|||
|
||||
public function test_page_crawl_fillable_fields_persist_and_casts_are_applied(): void
|
||||
{
|
||||
Queue::fake();
|
||||
|
||||
$page = Page::factory()->createQuietly(['url' => 'https://example.com/page-1']);
|
||||
|
||||
$completedAt = Carbon::parse('2026-05-01 10:01:05');
|
||||
|
|
@ -90,6 +93,8 @@ public function test_deleting_a_page_cascades_to_its_page_crawls(): void
|
|||
|
||||
public function test_pending_crawls_are_filtered_by_null_outcome(): void
|
||||
{
|
||||
Queue::fake();
|
||||
|
||||
// createQuietly() skips the PageObserver; this test counts rows with null/non-null
|
||||
// outcome — the auto-inserted observer crawl (outcome=null) would corrupt both counts.
|
||||
$page = Page::factory()->createQuietly(['url' => 'https://example.com/page-pending']);
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@
|
|||
use App\Models\PageLink;
|
||||
use Carbon\Carbon;
|
||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
||||
use Illuminate\Support\Facades\Queue;
|
||||
use Lvl0\FediDiscover\Config\InstanceType;
|
||||
use Lvl0\FediDiscover\Models\Instance;
|
||||
use Tests\TestCase;
|
||||
|
|
@ -18,6 +19,12 @@ class PageTest extends TestCase
|
|||
{
|
||||
use RefreshDatabase;
|
||||
|
||||
/**
 * Fake the queue before every test in this class so nothing is actually
 * pushed to a queue while the tests run.
 */
protected function setUp(): void
{
    parent::setUp();

    Queue::fake();
}
|
||||
|
||||
public function test_page_model_fillable_fields_can_be_mass_assigned(): void
|
||||
{
|
||||
$page = Page::create([
|
||||
|
|
|
|||
Loading…
Reference in a new issue