createQuietly(['url' => 'https://example.com/article']);

        PageCrawl::factory()->page($page)->create();

        Queue::assertPushed(ProcessCrawlJob::class);
    }

    /**
     * The ProcessCrawlJob pushed when a PageCrawl is created must carry that exact row.
     */
    public function test_dispatched_job_carries_the_correct_page_crawl(): void
    {
        Queue::fake();

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        // create() (not createQuietly()) so model events fire — presumably what triggers the dispatch.
        $crawl = PageCrawl::factory()->page($page)->create();

        Queue::assertPushed(
            ProcessCrawlJob::class,
            static fn (ProcessCrawlJob $job): bool => $job->pageCrawl->id === $crawl->id,
        );
    }

    /**
     * A successful fetch is recorded on the PageCrawl row: outcome, completion timestamp
     * and status code are set, and no error message is stored.
     */
    public function test_handle_writes_outcome_to_page_crawl_on_success(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Success,
            statusCode: 200,
            title: 'Hello',
            extractedText: 'hi',
            wordCount: 1,
        );

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        $fresh = $crawl->fresh();
        $this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome);
        $this->assertNotNull($fresh->completed_at);
        $this->assertInstanceOf(Carbon::class, $fresh->completed_at);
        $this->assertSame(200, $fresh->status_code);
        $this->assertNull($fresh->error_message);
    }

    /**
     * A successful fetch moves the Page to Fetched, stamps fetched_at and stores the title.
     */
    public function test_handle_updates_page_to_fetched_on_success(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Success,
            statusCode: 200,
            title: 'Hello',
            extractedText: 'hi',
            wordCount: 1,
        );

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        $fresh = $page->fresh();
        $this->assertSame(PageStatusEnum::Fetched, $fresh->status);
        $this->assertNotNull($fresh->fetched_at);
        $this->assertInstanceOf(Carbon::class, $fresh->fetched_at);
        $this->assertSame('Hello', $fresh->title);
    }

    /**
     * A Rejected outcome moves the Page to Rejected and leaves fetched_at unset.
     */
    public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Rejected,
            statusCode: 200,
            errorMessage: 'Unsupported Content-Type: application/pdf',
        );

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/brochure.pdf']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        $fresh = $page->fresh();
        $this->assertSame(PageStatusEnum::Rejected, $fresh->status);
        $this->assertNull($fresh->fetched_at);
    }

    /**
     * A Blocked4xx outcome moves the Page to Failed and stamps failed_at.
     */
    public function test_handle_updates_page_to_failed_on_blocked_4xx(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(CrawlOutcomeEnum::Blocked4xx, statusCode: 404, errorMessage: 'HTTP 404');

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/gone']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        $fresh = $page->fresh();
        $this->assertSame(PageStatusEnum::Failed, $fresh->status);
        $this->assertNotNull($fresh->failed_at);
        $this->assertInstanceOf(Carbon::class, $fresh->failed_at);
    }

    /**
     * A Timeout outcome moves the Page to Failed and stamps failed_at.
     */
    public function test_handle_updates_page_to_failed_on_timeout(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Timeout,
            errorMessage: 'Connection timed out after 10 seconds',
        );

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/slow']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        $fresh = $page->fresh();
        $this->assertSame(PageStatusEnum::Failed, $fresh->status);
        $this->assertNotNull($fresh->failed_at);
        $this->assertInstanceOf(Carbon::class, $fresh->failed_at);
    }

    /**
     * A Failed outcome on the first attempt inserts a second, pending PageCrawl row for the
     * same page and pushes a ProcessCrawlJob for that new row.
     */
    public function test_handle_schedules_retry_on_transient_failure(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        // A second PageCrawl row (the retry) must have been inserted for the same page
        $this->assertSame(2, PageCrawl::where('page_id', $page->id)->count());

        // The new row is pending — outcome IS NULL
        $retryRow = PageCrawl::where('page_id', $page->id)
            ->whereNull('outcome')
            ->first();
        $this->assertNotNull($retryRow);

        // A ProcessCrawlJob must have been pushed for the retry row.
        // Only the row identity is checked here; any delay on the job is not asserted.
        Queue::assertPushed(
            ProcessCrawlJob::class,
            static fn (ProcessCrawlJob $job): bool => $job->pageCrawl->page_id === $page->id
                && $job->pageCrawl->id === $retryRow->id,
        );
    }

    /**
     * When the failing attempt is the third row for the page, no further PageCrawl row is
     * created and no ProcessCrawlJob is pushed.
     */
    public function test_handle_does_not_retry_after_three_attempts(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);

        // Two earlier failed attempts plus the pending attempt under test make three rows — the cap
        PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly();
        PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly();
        $thirdCrawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl])->handle();

        // No 4th row must appear — retry cap reached
        $this->assertSame(3, PageCrawl::where('page_id', $page->id)->count());

        // No retry job dispatched
        Queue::assertNotPushed(ProcessCrawlJob::class);
    }

    /**
     * A Failed outcome is persisted on the PageCrawl row together with its error message
     * and a null status code.
     */
    public function test_handle_writes_failed_outcome_to_page_crawl(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'boom');

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        $this->assertDatabaseHas('page_crawls', [
            'id' => $crawl->id,
            'outcome' => CrawlOutcomeEnum::Failed->value,
            'status_code' => null,
            'error_message' => 'boom',
        ]);
    }

    /**
     * A Failed outcome moves the Page to Failed.
     */
    public function test_handle_updates_page_to_failed_on_failed_outcome(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
    }

    /**
     * A Blocked5xx outcome moves the Page to Failed.
     */
    public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(CrawlOutcomeEnum::Blocked5xx, statusCode: 503, errorMessage: 'HTTP 503');

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
    }

    /**
     * A BlockedRobots outcome returned by the fetcher moves the Page to Failed.
     */
    public function test_handle_updates_page_to_failed_on_blocked_robots(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(CrawlOutcomeEnum::BlockedRobots, errorMessage: 'Disallowed by robots.txt');

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);
    }

    /**
     * Outbound links carried by a Failed fetch result must not be turned into Page rows.
     */
    public function test_handle_does_not_register_outbound_links_on_failure(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Failed,
            outboundLinks: collect(['https://should-not-be-registered.com/page']),
            errorMessage: 'Connection refused',
        );

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/broken']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        $this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']);
        // Only the source page exists
        $this->assertSame(1, Page::count());
    }

    /**
     * Each outbound link of a successful fetch is registered as its own Page row.
     */
    public function test_handle_registers_outbound_links_on_success(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Success,
            statusCode: 200,
            finalUrl: 'https://source.com/article',
            title: 'Source Article',
            extractedText: 'some text',
            outboundLinks: collect(['https://other.com/article-1', 'https://another.com/post-2']),
            wordCount: 2,
        );

        $page = Page::factory()->createQuietly(['url' => 'https://source.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        $this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']);
        $this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']);
        // Source page + the two discovered links
        $this->assertSame(3, Page::count());
    }

    /**
     * When the domain lock is already held, handle() returns before fetching: the fetcher is
     * never invoked, no outcome is written and the page status is left untouched.
     *
     * NOTE(review): despite the name, the release back onto the queue is not asserted here —
     * only the early return is observable through these assertions.
     */
    public function test_handle_releases_job_when_domain_is_locked(): void
    {
        Queue::fake();

        // Pre-acquire the lock so the job sees it as already held
        Cache::lock('crawler:domain:example.com', 10)->get();

        // The fetcher must NOT be called — the job should bail before reaching it
        $fetcher = Mockery::mock(FetchPageAction::class);
        $fetcher->shouldNotReceive('__invoke');
        $this->app->instance(FetchPageAction::class, $fetcher);

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $job = new ProcessCrawlJob($crawl);
        $job->handle();

        // No outcome written — handle() returned early
        $this->assertNull($crawl->fresh()->outcome);

        // Page status unchanged from its factory default (Discovered)
        $this->assertSame(PageStatusEnum::Discovered, $page->fresh()->status);
    }

    /**
     * The domain lock taken inside handle() must still be held once handle() has returned.
     */
    public function test_handle_does_not_release_lock_after_completion(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        $job = new ProcessCrawlJob($crawl);
        $job->handle();

        // If handle() called $lock->release(), this second get() would succeed (true).
        // It must fail (false) — the lock acquired inside handle() must still be held.
        $result = Cache::lock('crawler:domain:example.com', 10)->get();

        $this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.');
    }

    /**
     * When robots.txt disallows everything, the crawl is recorded as BlockedRobots, the page
     * becomes Failed, the fetcher is never invoked and the domain lock is never taken.
     */
    public function test_handle_writes_blocked_robots_when_disallowed(): void
    {
        Queue::fake();
        Http::fake([
            'https://example.com/robots.txt' => Http::response(
                "User-agent: *\nDisallow: /",
                200,
            ),
        ]);

        // FetchPageAction must never be called — the robots gate returns before the lock
        $fetcher = Mockery::mock(FetchPageAction::class);
        $fetcher->shouldNotReceive('__invoke');
        $this->app->instance(FetchPageAction::class, $fetcher);

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();
        $domain = $crawl->domain;

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        // Outcome row must record BlockedRobots
        $this->assertDatabaseHas('page_crawls', [
            'id' => $crawl->id,
            'outcome' => CrawlOutcomeEnum::BlockedRobots->value,
        ]);

        // Page status must be Failed (BlockedRobots::toPageStatus() === Failed)
        $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);

        // The politeness lock must still be acquirable — the gate returned before ever claiming it
        $this->assertTrue(
            Cache::lock("crawler:domain:{$domain}", 10)->get(),
            'Expected the politeness lock to be free (gate returned before acquiring it), but it was already held.',
        );
    }

    /**
     * handle() takes the per-domain lock and still completes the fetch: afterwards the lock
     * is held and the Success outcome has been written.
     */
    public function test_handle_acquires_domain_lock_before_fetching(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);

        $page = Page::factory()->createQuietly(['url' => 'https://lock-test.example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();
        $domain = $crawl->domain;

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        // The lock must still be held after handle() completes — a second attempt to acquire it fails
        $this->assertFalse(
            Cache::lock("crawler:domain:{$domain}", 10)->get(),
            'Expected the domain lock to still be held after handle() ran, but it was free.',
        );

        // The fetch ran — outcome was written (proves the lock did not block execution)
        $this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome);
    }

    /**
     * When robots.txt allows the URL, the fetcher is invoked exactly once, the Success outcome
     * is persisted, the page becomes Fetched and the domain lock is left held.
     */
    public function test_handle_proceeds_through_politeness_lock_when_robots_allow(): void
    {
        Queue::fake();
        Http::fake([
            'https://example.com/robots.txt' => Http::response(
                "User-agent: *\nAllow: /",
                200,
            ),
        ]);

        // FetchPageAction must be called exactly once — robots gate passed, fetch proceeds
        $fetcher = Mockery::mock(FetchPageAction::class);
        $fetcher->shouldReceive('__invoke')->once()->andReturn(new FetchResult(
            outcome: CrawlOutcomeEnum::Success,
            statusCode: 200,
            finalUrl: 'https://example.com/article',
            title: 'Hello',
            extractedText: 'hi',
            outboundLinks: collect(),
            wordCount: 1,
            errorMessage: null,
        ));
        $this->app->instance(FetchPageAction::class, $fetcher);

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();
        $domain = $crawl->domain;

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        // Outcome must be Success — not BlockedRobots
        $this->assertDatabaseHas('page_crawls', [
            'id' => $crawl->id,
            'outcome' => CrawlOutcomeEnum::Success->value,
        ]);

        // Page status must have advanced to Fetched
        $this->assertSame(PageStatusEnum::Fetched, $page->fresh()->status);

        // Politeness lock must still be held (claimed during the fetch, never released)
        $this->assertFalse(
            Cache::lock("crawler:domain:{$domain}", 10)->get(),
            'Expected the politeness lock to be held after a successful fetch, but it was free.',
        );
    }

    /**
     * The language and confidence reported by a successful fetch are stored on the Page.
     */
    public function test_handle_persists_language_on_success(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Success,
            statusCode: 200,
            title: 'Hello',
            extractedText: 'hi',
            wordCount: 1,
            language: 'en',
            languageConfidence: 0.95,
        );

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        $fresh = $page->fresh();
        $this->assertSame('en', $fresh->language);
        // Compared with a tolerance because the confidence is a float
        $this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001);
    }

    /**
     * A successful fetch that reports no language must keep the Page's previously stored
     * language and confidence while still updating status and title.
     */
    public function test_handle_does_not_overwrite_existing_language_when_new_fetch_returns_null(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Success,
            statusCode: 200,
            title: 'Hello',
            extractedText: 'hi',
            wordCount: 1,
            language: null,
            languageConfidence: null,
        );

        // Page already has a language from a previous fetch
        $page = Page::factory()->createQuietly([
            'url' => 'https://example.com/article',
            'language' => 'en',
            'language_confidence' => 0.95,
        ]);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        $fresh = $page->fresh();

        // Language columns must be sticky — null detection must NOT overwrite them
        $this->assertSame('en', $fresh->language);
        $this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001);

        // Other columns must still update — sticky applies to language only
        $this->assertSame(PageStatusEnum::Fetched, $fresh->status);
        $this->assertSame('Hello', $fresh->title);
    }

    /**
     * With no stored language and none reported by the fetch, both language columns stay null.
     */
    public function test_handle_leaves_language_null_when_no_prior_and_no_detection(): void
    {
        Queue::fake();
        $this->mockFetchPageAction(
            CrawlOutcomeEnum::Success,
            statusCode: 200,
            title: 'Hello',
            extractedText: 'hi',
            wordCount: 1,
            language: null,
            languageConfidence: null,
        );

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
        $crawl = PageCrawl::factory()->page($page)->createQuietly();

        app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])->handle();

        $fresh = $page->fresh();
        $this->assertNull($fresh->language);
        $this->assertNull($fresh->language_confidence);
    }

    /**
     * Binds a Mockery double of FetchPageAction in the container whose __invoke() returns a
     * FetchResult built from the given arguments.
     *
     * No call-count expectation is set, so the double may be invoked any number of times
     * (including zero) without failing the test.
     *
     * @param CrawlOutcomeEnum $outcome            Outcome the fake fetch reports.
     * @param int|null         $statusCode         HTTP status code to report, if any.
     * @param string|null      $finalUrl           Final URL to report; defaults to the article URL most tests use.
     * @param string|null      $title              Page title to report.
     * @param string|null      $extractedText      Extracted body text to report.
     * @param Collection|null  $outboundLinks      Outbound link URLs; null is replaced by an empty collection.
     * @param int|null         $wordCount          Word count to report.
     * @param string|null      $errorMessage       Error message to report for non-success outcomes.
     * @param string|null      $language           Detected language to report.
     * @param float|null       $languageConfidence Confidence accompanying the detected language.
     */
    private function mockFetchPageAction(
        CrawlOutcomeEnum $outcome,
        ?int $statusCode = null,
        ?string $finalUrl = 'https://example.com/article',
        ?string $title = null,
        ?string $extractedText = null,
        ?Collection $outboundLinks = null,
        ?int $wordCount = null,
        ?string $errorMessage = null,
        ?string $language = null,
        ?float $languageConfidence = null,
    ): void {
        $fetcher = Mockery::mock(FetchPageAction::class);
        $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult(
            outcome: $outcome,
            statusCode: $statusCode,
            finalUrl: $finalUrl,
            title: $title,
            extractedText: $extractedText,
            outboundLinks: $outboundLinks ?? collect(),
            wordCount: $wordCount,
            errorMessage: $errorMessage,
            language: $language,
            languageConfidence: $languageConfidence,
        ));

        // Replace the container binding so the job resolves this double instead of the real action
        $this->app->instance(FetchPageAction::class, $fetcher);
    }
}