2026-04-26 21:15:07 +02:00
< ? php
declare ( strict_types = 1 );
namespace Tests\Feature\Jobs ;
2026-04-26 23:50:57 +02:00
use App\Actions\FetchPageAction ;
use App\Enums\CrawlOutcomeEnum ;
use App\Enums\PageStatusEnum ;
2026-04-26 21:15:07 +02:00
use App\Jobs\ProcessCrawlJob ;
use App\Models\Page ;
use App\Models\PageCrawl ;
2026-04-26 23:50:57 +02:00
use App\ValueObjects\FetchResult ;
use Carbon\Carbon ;
2026-04-26 21:15:07 +02:00
use Illuminate\Foundation\Testing\RefreshDatabase ;
2026-04-27 00:24:38 +02:00
use Illuminate\Support\Collection ;
2026-04-27 01:25:46 +02:00
use Illuminate\Support\Facades\Cache ;
2026-04-27 23:53:52 +02:00
use Illuminate\Support\Facades\Http ;
2026-04-26 21:15:07 +02:00
use Illuminate\Support\Facades\Queue ;
2026-04-26 23:50:57 +02:00
use Mockery ;
2026-04-26 21:15:07 +02:00
use Tests\TestCase ;
class ProcessCrawlJobTest extends TestCase
{
use RefreshDatabase ;
/**
 * Creating a PageCrawl through the factory with model events enabled
 * must push a ProcessCrawlJob onto the (faked) queue.
 */
public function test_creating_a_page_crawl_dispatches_process_crawl_job(): void
{
    Queue::fake();

    // The page is created quietly; only the crawl is created with events on.
    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);

    PageCrawl::factory()->page($article)->create();

    Queue::assertPushed(ProcessCrawlJob::class);
}
/**
 * The ProcessCrawlJob pushed when a PageCrawl is created must reference
 * that very PageCrawl (matched by id).
 */
public function test_dispatched_job_carries_the_correct_page_crawl(): void
{
    Queue::fake();

    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $createdCrawl = PageCrawl::factory()->page($article)->create();

    // Predicate handed to the queue fake: true only for a job wrapping the crawl created above.
    $carriesCreatedCrawl = static function (ProcessCrawlJob $job) use ($createdCrawl): bool {
        return $job->pageCrawl->id === $createdCrawl->id;
    };

    Queue::assertPushed(ProcessCrawlJob::class, $carriesCreatedCrawl);
}
2026-04-26 23:50:57 +02:00
/**
 * A successful fetch must be recorded on the PageCrawl row: outcome,
 * completion timestamp, HTTP status code, and no error message.
 */
public function test_handle_writes_outcome_to_page_crawl_on_success(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        title: 'Hello',
        extractedText: 'hi',
        wordCount: 1,
    );

    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pendingCrawl = PageCrawl::factory()->page($article)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    // Re-read the row so the assertions see what was persisted.
    $reloaded = $pendingCrawl->fresh();

    $this->assertSame(CrawlOutcomeEnum::Success, $reloaded->outcome);
    $this->assertNotNull($reloaded->completed_at);
    $this->assertInstanceOf(Carbon::class, $reloaded->completed_at);
    $this->assertSame(200, $reloaded->status_code);
    $this->assertNull($reloaded->error_message);
}
/**
 * A successful fetch must move the Page to the Fetched status, stamp
 * fetched_at, and store the title returned by the fetcher.
 */
public function test_handle_updates_page_to_fetched_on_success(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        title: 'Hello',
        extractedText: 'hi',
        wordCount: 1,
    );

    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pendingCrawl = PageCrawl::factory()->page($article)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    // Re-read the page so the assertions see what was persisted.
    $reloadedPage = $article->fresh();

    $this->assertSame(PageStatusEnum::Fetched, $reloadedPage->status);
    $this->assertNotNull($reloadedPage->fetched_at);
    $this->assertInstanceOf(Carbon::class, $reloadedPage->fetched_at);
    $this->assertSame('Hello', $reloadedPage->title);
}
/**
 * A Rejected fetch outcome must set the Page status to Rejected and
 * leave fetched_at empty.
 */
public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Rejected,
        statusCode: 200,
        errorMessage: 'Unsupported Content-Type: application/pdf',
    );

    $brochure = Page::factory()->createQuietly(['url' => 'https://example.com/brochure.pdf']);
    $pendingCrawl = PageCrawl::factory()->page($brochure)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    $reloadedPage = $brochure->fresh();

    $this->assertSame(PageStatusEnum::Rejected, $reloadedPage->status);
    $this->assertNull($reloadedPage->fetched_at);
}
/**
 * A Blocked4xx fetch outcome (here: HTTP 404) must mark the Page as
 * Failed and stamp failed_at.
 */
public function test_handle_updates_page_to_failed_on_blocked_4xx(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Blocked4xx,
        statusCode: 404,
        errorMessage: 'HTTP 404',
    );

    $gonePage = Page::factory()->createQuietly(['url' => 'https://example.com/gone']);
    $pendingCrawl = PageCrawl::factory()->page($gonePage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    $reloadedPage = $gonePage->fresh();

    $this->assertSame(PageStatusEnum::Failed, $reloadedPage->status);
    $this->assertNotNull($reloadedPage->failed_at);
    $this->assertInstanceOf(Carbon::class, $reloadedPage->failed_at);
}
/**
 * A Timeout fetch outcome must mark the Page as Failed and stamp
 * failed_at.
 */
public function test_handle_updates_page_to_failed_on_timeout(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Timeout,
        errorMessage: 'Connection timed out after 10 seconds',
    );

    $slowPage = Page::factory()->createQuietly(['url' => 'https://example.com/slow']);
    $pendingCrawl = PageCrawl::factory()->page($slowPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    $reloadedPage = $slowPage->fresh();

    $this->assertSame(PageStatusEnum::Failed, $reloadedPage->status);
    $this->assertNotNull($reloadedPage->failed_at);
    $this->assertInstanceOf(Carbon::class, $reloadedPage->failed_at);
}
/**
 * A Failed fetch outcome on a first attempt must create a second,
 * still-pending PageCrawl row for the same page and push a
 * ProcessCrawlJob that targets that new row.
 */
public function test_handle_schedules_retry_on_transient_failure(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

    $unstablePage = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
    $firstAttempt = PageCrawl::factory()->page($unstablePage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $firstAttempt]);
    $job->handle();

    // Original attempt plus the retry: exactly two rows for this page.
    $attemptCount = PageCrawl::where('page_id', $unstablePage->id)->count();
    $this->assertSame(2, $attemptCount);

    // The retry row is the one whose outcome is still NULL (pending).
    $pendingRetry = PageCrawl::where('page_id', $unstablePage->id)
        ->whereNull('outcome')
        ->first();
    $this->assertNotNull($pendingRetry);

    // A ProcessCrawlJob must have been pushed for exactly that retry row.
    $targetsRetryRow = function (ProcessCrawlJob $job) use ($unstablePage, $pendingRetry): bool {
        return $job->pageCrawl->page_id === $unstablePage->id
            && $job->pageCrawl->id === $pendingRetry->id;
    };

    Queue::assertPushed(ProcessCrawlJob::class, $targetsRetryRow);
}
/**
 * When the crawl being handled is the third attempt for a page and it
 * fails too, no further PageCrawl row may be created and no
 * ProcessCrawlJob may be pushed.
 */
public function test_handle_does_not_retry_after_three_attempts(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

    $unreachablePage = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);

    // Two already-failed attempts, followed by the pending third attempt under test.
    for ($attempt = 1; $attempt <= 2; $attempt++) {
        PageCrawl::factory()->page($unreachablePage)->failed('Connection refused')->createQuietly();
    }
    $thirdCrawl = PageCrawl::factory()->page($unreachablePage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl]);
    $job->handle();

    // Still three rows: no fourth attempt was inserted.
    $attemptCount = PageCrawl::where('page_id', $unreachablePage->id)->count();
    $this->assertSame(3, $attemptCount);

    // And nothing was queued as a retry.
    Queue::assertNotPushed(ProcessCrawlJob::class);
}
2026-04-27 00:18:34 +02:00
/**
 * A Failed fetch outcome must be persisted on the handled PageCrawl row
 * with a NULL status code and the fetcher's error message.
 */
public function test_handle_writes_failed_outcome_to_page_crawl(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'boom');

    $unstablePage = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
    $pendingCrawl = PageCrawl::factory()->page($unstablePage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    $expectedRow = [
        'id' => $pendingCrawl->id,
        'outcome' => CrawlOutcomeEnum::Failed->value,
        'status_code' => null,
        'error_message' => 'boom',
    ];

    $this->assertDatabaseHas('page_crawls', $expectedRow);
}
/**
 * A Failed fetch outcome must set the Page status to Failed.
 */
public function test_handle_updates_page_to_failed_on_failed_outcome(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

    $unreachablePage = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);
    $pendingCrawl = PageCrawl::factory()->page($unreachablePage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    $persistedStatus = $unreachablePage->fresh()->status;

    $this->assertSame(PageStatusEnum::Failed, $persistedStatus);
}
/**
 * A Blocked5xx fetch outcome (here: HTTP 503) must set the Page status
 * to Failed.
 */
public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Blocked5xx,
        statusCode: 503,
        errorMessage: 'HTTP 503',
    );

    $overloadedPage = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']);
    $pendingCrawl = PageCrawl::factory()->page($overloadedPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    $persistedStatus = $overloadedPage->fresh()->status;

    $this->assertSame(PageStatusEnum::Failed, $persistedStatus);
}
/**
 * A BlockedRobots fetch outcome must set the Page status to Failed.
 */
public function test_handle_updates_page_to_failed_on_blocked_robots(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::BlockedRobots,
        errorMessage: 'Disallowed by robots.txt',
    );

    $privatePage = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
    $pendingCrawl = PageCrawl::factory()->page($privatePage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    $persistedStatus = $privatePage->fresh()->status;

    $this->assertSame(PageStatusEnum::Failed, $persistedStatus);
}
/**
 * Outbound links carried by a Failed fetch result must not be turned
 * into Page rows: only the crawled page itself may exist afterwards.
 */
public function test_handle_does_not_register_outbound_links_on_failure(): void
{
    Queue::fake();

    // The fetch result deliberately carries a link even though the outcome is Failed.
    $linksThatMustBeIgnored = collect(['https://should-not-be-registered.com/page']);
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Failed,
        outboundLinks: $linksThatMustBeIgnored,
        errorMessage: 'Connection refused',
    );

    $brokenPage = Page::factory()->createQuietly(['url' => 'https://example.com/broken']);
    $pendingCrawl = PageCrawl::factory()->page($brokenPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    $this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']);
    $this->assertSame(1, Page::count());
}
2026-04-26 23:50:57 +02:00
/**
 * Outbound links carried by a Success fetch result must each become a
 * Page row, in addition to the source page that was crawled.
 */
public function test_handle_registers_outbound_links_on_success(): void
{
    Queue::fake();

    $discoveredLinks = collect(['https://other.com/article-1', 'https://another.com/post-2']);
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://source.com/article',
        title: 'Source Article',
        extractedText: 'some text',
        outboundLinks: $discoveredLinks,
        wordCount: 2,
    );

    $sourcePage = Page::factory()->createQuietly(['url' => 'https://source.com/article']);
    $pendingCrawl = PageCrawl::factory()->page($sourcePage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    $this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']);
    $this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']);

    // Source page + two outbound links.
    $this->assertSame(3, Page::count());
}
2026-04-27 00:24:38 +02:00
2026-04-27 01:25:46 +02:00
/**
 * When the domain lock is already held, handle() must not invoke the
 * fetcher, must not write an outcome, and must leave the Page status
 * untouched.
 *
 * NOTE(review): despite the test name, releasing the job back onto the
 * queue is not asserted here — only the absence of side effects is.
 */
public function test_handle_releases_job_when_domain_is_locked(): void
{
    Queue::fake();

    // Take the domain lock up front so the job finds it already held.
    Cache::lock('crawler:domain:example.com', 10)->get();

    // Any call to the fetcher is a failure: the job has to stop before reaching it.
    $untouchedFetcher = Mockery::mock(FetchPageAction::class);
    $untouchedFetcher->shouldNotReceive('__invoke');
    $this->app->instance(FetchPageAction::class, $untouchedFetcher);

    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pendingCrawl = PageCrawl::factory()->page($article)->createQuietly();

    (new ProcessCrawlJob($pendingCrawl))->handle();

    // Nothing was recorded on the crawl row.
    $persistedOutcome = $pendingCrawl->fresh()->outcome;
    $this->assertNull($persistedOutcome);

    // The page still carries the Discovered status (presumably its factory default).
    $persistedStatus = $article->fresh()->status;
    $this->assertSame(PageStatusEnum::Discovered, $persistedStatus);
}
/**
 * After a successful handle(), the domain lock must still be held: a
 * fresh attempt to acquire the same lock has to fail.
 */
public function test_handle_does_not_release_lock_after_completion(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);

    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pendingCrawl = PageCrawl::factory()->page($article)->createQuietly();

    (new ProcessCrawlJob($pendingCrawl))->handle();

    // Had handle() released the lock, this acquisition would return true.
    // It has to return false, proving the lock taken inside handle() is still in place.
    $acquiredAgain = Cache::lock('crawler:domain:example.com', 10)->get();

    $this->assertFalse($acquiredAgain, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.');
}
2026-04-27 23:53:52 +02:00
/**
 * When robots.txt disallows the URL, handle() must record a
 * BlockedRobots outcome, mark the Page as Failed, never invoke the
 * fetcher, and never claim the per-domain politeness lock.
 */
public function test_handle_writes_blocked_robots_when_disallowed(): void
{
    Queue::fake();

    // Serve a robots.txt that disallows everything for every user agent.
    Http::fake([
        'https://example.com/robots.txt' => Http::response(
            " User-agent: * \n Disallow: / ",
            200,
        ),
    ]);

    // FetchPageAction must never be called — the robots gate returns before the lock.
    $fetcher = Mockery::mock(FetchPageAction::class);
    $fetcher->shouldNotReceive('__invoke');
    $this->app->instance(FetchPageAction::class, $fetcher);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    // Built by concatenation so the key is exactly `crawler:domain:<host>`, the same
    // format as the literal lock keys used by the other lock tests in this class.
    // The previous interpolated literal contained stray spaces and never matched it,
    // which made the "lock is free" assertion below pass vacuously.
    $lockKey = 'crawler:domain:' . $crawl->domain;

    app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
        ->handle();

    // Outcome row must record BlockedRobots.
    $this->assertDatabaseHas('page_crawls', [
        'id' => $crawl->id,
        'outcome' => CrawlOutcomeEnum::BlockedRobots->value,
    ]);

    // A BlockedRobots outcome is expected to surface as a Failed page status.
    $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);

    // The politeness lock must still be acquirable — the gate returned before ever claiming it.
    $this->assertTrue(
        Cache::lock($lockKey, 10)->get(),
        'Expected the politeness lock to be free (gate returned before acquiring it), but it was already held.',
    );
}
2026-04-27 01:25:46 +02:00
/**
 * handle() must claim the per-domain lock for the crawl's domain and
 * still complete the fetch: afterwards the lock is held and the
 * Success outcome is persisted.
 */
public function test_handle_acquires_domain_lock_before_fetching(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);

    $page = Page::factory()->createQuietly(['url' => 'https://lock-test.example.com/article']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    // Built by concatenation so the key is exactly `crawler:domain:<host>`, the same
    // format as the literal lock keys used by the other lock tests in this class.
    // The previous interpolated literal contained stray spaces, so the assertion
    // below probed a key the job never locks.
    $lockKey = 'crawler:domain:' . $crawl->domain;

    app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
        ->handle();

    // The lock must still be held after handle() completes — a second attempt to acquire it fails.
    $this->assertFalse(
        Cache::lock($lockKey, 10)->get(),
        'Expected the domain lock to still be held after handle() ran, but it was free.',
    );

    // The fetch ran — outcome was written (proves the lock did not block execution).
    $this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome);
}
2026-04-27 23:53:52 +02:00
/**
 * When robots.txt allows the URL, handle() must invoke the fetcher
 * exactly once, persist the Success outcome, advance the Page to
 * Fetched, and leave the per-domain politeness lock held.
 */
public function test_handle_proceeds_through_politeness_lock_when_robots_allow(): void
{
    Queue::fake();

    // Serve a robots.txt that allows everything for every user agent.
    Http::fake([
        'https://example.com/robots.txt' => Http::response(
            " User-agent: * \n Allow: / ",
            200,
        ),
    ]);

    // FetchPageAction must be called exactly once — robots gate passed, fetch proceeds.
    $fetcher = Mockery::mock(FetchPageAction::class);
    $fetcher->shouldReceive('__invoke')->once()->andReturn(new FetchResult(
        outcome: CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://example.com/article',
        title: 'Hello',
        extractedText: 'hi',
        outboundLinks: collect(),
        wordCount: 1,
        errorMessage: null,
    ));
    $this->app->instance(FetchPageAction::class, $fetcher);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    // Built by concatenation so the key is exactly `crawler:domain:<host>`, the same
    // format as the literal lock keys used by the other lock tests in this class.
    // The previous interpolated literal contained stray spaces, so the assertion
    // below probed a key the job never locks.
    $lockKey = 'crawler:domain:' . $crawl->domain;

    app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
        ->handle();

    // Outcome must be Success — not BlockedRobots.
    $this->assertDatabaseHas('page_crawls', [
        'id' => $crawl->id,
        'outcome' => CrawlOutcomeEnum::Success->value,
    ]);

    // Page status must have advanced to Fetched.
    $this->assertSame(PageStatusEnum::Fetched, $page->fresh()->status);

    // Politeness lock must still be held (claimed during the fetch, never released).
    $this->assertFalse(
        Cache::lock($lockKey, 10)->get(),
        'Expected the politeness lock to be held after a successful fetch, but it was free.',
    );
}
2026-04-27 00:24:38 +02:00
/**
 * Replace FetchPageAction in the container with a Mockery double whose
 * __invoke() returns a FetchResult assembled from the given arguments.
 *
 * The double places no limit on how many times __invoke() may be called.
 *
 * @param CrawlOutcomeEnum $outcome       Outcome the fake fetch reports.
 * @param int|null         $statusCode    HTTP status code to report, if any.
 * @param string|null      $finalUrl      Final URL to report; defaults to the example article URL.
 * @param string|null      $title         Page title to report, if any.
 * @param string|null      $extractedText Extracted text to report, if any.
 * @param Collection|null  $outboundLinks Outbound links to report; an empty collection when omitted.
 * @param int|null         $wordCount     Word count to report, if any.
 * @param string|null      $errorMessage  Error message to report, if any.
 */
private function mockFetchPageAction(
    CrawlOutcomeEnum $outcome,
    ?int $statusCode = null,
    ?string $finalUrl = 'https://example.com/article',
    ?string $title = null,
    ?string $extractedText = null,
    ?Collection $outboundLinks = null,
    ?int $wordCount = null,
    ?string $errorMessage = null,
): void {
    // The canned result every invocation of the double will hand back.
    $cannedResult = new FetchResult(
        outcome: $outcome,
        statusCode: $statusCode,
        finalUrl: $finalUrl,
        title: $title,
        extractedText: $extractedText,
        outboundLinks: $outboundLinks ?? collect(),
        wordCount: $wordCount,
        errorMessage: $errorMessage,
    );

    $double = Mockery::mock(FetchPageAction::class);
    $double->shouldReceive('__invoke')->andReturn($cannedResult);

    // From here on, anything resolving FetchPageAction from the container gets the double.
    $this->app->instance(FetchPageAction::class, $double);
}
2026-04-26 21:15:07 +02:00
}