Http::response( 'Hello', 200, ['Content-Type' => 'text/html'], ), ]); $result = $this->makeAction()('https://example.com/page'); $this->assertInstanceOf(FetchResult::class, $result); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); $this->assertSame(200, $result->statusCode); $this->assertNotNull($result->finalUrl); }

    /**
     * A faked 404 must come back as a Blocked4xx result that keeps the status
     * code, mentions it in the error message, and still reports a final URL.
     */
    public function test_4xx_response_returns_blocked_4xx(): void
    {
        // Arrange
        $status = 404;
        Http::fake([
            'example.com/*' => Http::response('Not Found', $status),
        ]);

        // Act
        $action = $this->makeAction();
        $fetched = $action('https://example.com/missing');

        // Assert
        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Blocked4xx, $fetched->outcome);
        $this->assertSame($status, $fetched->statusCode);
        $this->assertIsString($fetched->errorMessage);
        $this->assertStringContainsString('404', $fetched->errorMessage);
        $this->assertNotNull($fetched->finalUrl);
    }

    /**
     * A faked 503 must come back as a Blocked5xx result that keeps the status
     * code, mentions it in the error message, and still reports a final URL.
     */
    public function test_5xx_response_returns_blocked_5xx(): void
    {
        // Arrange
        $status = 503;
        Http::fake([
            'example.com/*' => Http::response('Service Unavailable', $status),
        ]);

        // Act
        $action = $this->makeAction();
        $fetched = $action('https://example.com/page');

        // Assert
        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Blocked5xx, $fetched->outcome);
        $this->assertSame($status, $fetched->statusCode);
        $this->assertIsString($fetched->errorMessage);
        $this->assertStringContainsString('503', $fetched->errorMessage);
        $this->assertNotNull($fetched->finalUrl);
    }

    /**
     * A 200 response served as application/pdf must be Rejected: the content
     * type is named in the error message and no page data is populated.
     */
    public function test_non_html_content_type_returns_rejected(): void
    {
        // Arrange
        $contentType = 'application/pdf';
        $pdfResponse = Http::response('PDF binary stuff', 200, ['Content-Type' => $contentType]);
        Http::fake(['example.com/*' => $pdfResponse]);

        // Act
        $action = $this->makeAction();
        $fetched = $action('https://example.com/document.pdf');

        // Assert: classification of the response
        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Rejected, $fetched->outcome);
        $this->assertSame(200, $fetched->statusCode);
        $this->assertIsString($fetched->errorMessage);
        $this->assertStringContainsString('application/pdf', $fetched->errorMessage);
        $this->assertNotNull($fetched->finalUrl);

        // Assert: nothing was extracted from the rejected body
        $this->assertNull($fetched->title);
        $this->assertNull($fetched->extractedText);
        $this->assertEmpty($fetched->outboundLinks);
        $this->assertNull($fetched->wordCount);
    }

    /**
     * A text/html content type carrying a charset parameter must still be
     * treated as HTML and produce a Success result.
     */
    public function test_text_html_with_charset_is_accepted(): void
    {
        // Arrange
        $htmlResponse = Http::response(
            'Hello charset world',
            200,
            ['Content-Type' => 'text/html; charset=utf-8'],
        );
        Http::fake(['example.com/*' => $htmlResponse]);

        // Act
        $action = $this->makeAction();
        $fetched = $action('https://example.com/page');

        // Assert
        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        $this->assertSame(200, $fetched->statusCode);
    }

    /**
     * A ConnectException thrown by the HTTP layer (errno 6, "Could not resolve
     * host") must yield a Failed result with no status, URL, or page data.
     */
    public function test_connection_failure_returns_failed(): void
    {
        // Arrange: every request throws instead of returning a response
        $url = 'https://example.com/page';
        Http::fake(function () use ($url) {
            $request = new Request('GET', $url);

            throw new ConnectException('Could not resolve host', $request, null, ['errno' => 6]);
        });

        // Act
        $action = $this->makeAction();
        $fetched = $action($url);

        // Assert: classification of the failure
        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Failed, $fetched->outcome);
        $this->assertNull($fetched->statusCode);
        $this->assertNull($fetched->finalUrl);
        $this->assertIsString($fetched->errorMessage);

        // Assert: no page data on a failed fetch
        $this->assertNull($fetched->title);
        $this->assertNull($fetched->extractedText);
        $this->assertEmpty($fetched->outboundLinks);
        $this->assertNull($fetched->wordCount);
    }

    /**
     * A ConnectException carrying errno 28 ("Operation timed out") must be
     * reported as Timeout, not as a generic failure.
     */
    public function test_timeout_returns_timeout(): void
    {
        // Arrange: every request throws a timeout-flavoured connect error
        $url = 'https://example.com/page';
        Http::fake(function () use ($url) {
            $request = new Request('GET', $url);

            throw new ConnectException('cURL error 28: Operation timed out', $request, null, ['errno' => 28]);
        });

        // Act
        $action = $this->makeAction();
        $fetched = $action($url);

        // Assert
        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Timeout, $fetched->outcome);
        $this->assertNull($fetched->statusCode);
        $this->assertNull($fetched->finalUrl);
        $this->assertIsString($fetched->errorMessage);
    }

    /**
     * A successful HTML fetch must expose the page title on the result.
     */
    public function test_success_extracts_title_from_html(): void
    {
        // Arrange
        // NOTE(review): this fixture contains no HTML markup as it stands;
        // presumably the original tags were lost in transit - confirm against
        // the repository copy before relying on it.
        $body = 'My Page Title

Some content.

';
        Http::fake([
            'example.com/*' => Http::response($body, 200, ['Content-Type' => 'text/html']),
        ]);

        // Act
        $action = $this->makeAction();
        $fetched = $action('https://example.com/page');

        // Assert
        $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        $this->assertSame('My Page Title', $fetched->title);
    }

    public function test_success_extracts_main_text(): void { $html = <<<'HTML' Article Title

The Real Article

This is the main article body that should be extracted by readability.

Multiple paragraphs prove the extractor works on the full content.

Article Title

This article references an external article.

And a relative link to a related post on the same site.

Plus a private IP link that should be rejected.

And a credentials URL that should be rejected.

And a non-http scheme that should be rejected.

HTML; Http::fake([ 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), ]); $result = $this->makeAction()('https://example.com/article'); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); $this->assertInstanceOf(Collection::class, $result->outboundLinks); $this->assertSame(2, $result->outboundLinks->count()); $this->assertContains('https://other.com/article', $result->outboundLinks->all()); $this->assertContains('https://example.com/related-post', $result->outboundLinks->all()); $this->assertNotContains('http://192.168.1.1/admin', $result->outboundLinks->all()); $this->assertNotContains('https://user:pass@evil.com/', $result->outboundLinks->all()); $this->assertNotContains('ftp://files.example.com/', $result->outboundLinks->all()); } public function test_success_calculates_word_count(): void { $html = <<<'HTML' Word Count Test

This article body has exactly nine words total here.

Content here.

', 200, ['Content-Type' => 'Text/HTML; charset=utf-8'], ), ]); $result = $this->makeAction()('https://example.com/page'); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); } public function test_empty_href_is_filtered_from_outbound_links(): void { $html = <<<'HTML' Empty Href Test

This paragraph has an empty href anchor that should be dropped.

Jump to section 2 of this page.