Http::response( 'Hello', 200, ['Content-Type' => 'text/html'], ), ]); $result = $this->makeAction()('https://example.com/page'); $this->assertInstanceOf(FetchResult::class, $result); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); $this->assertSame(200, $result->statusCode); $this->assertNotNull($result->finalUrl); } /** Verifies that a faked 404 response yields a FetchResult with outcome Blocked4xx, status code 404, a non-null final URL, and an error message containing "404". */ public function test_4xx_response_returns_blocked_4xx(): void { Http::fake([ 'example.com/*' => Http::response('Not Found', 404), ]); $result = $this->makeAction()('https://example.com/missing'); $this->assertInstanceOf(FetchResult::class, $result); $this->assertSame(CrawlOutcomeEnum::Blocked4xx, $result->outcome); $this->assertSame(404, $result->statusCode); $this->assertIsString($result->errorMessage); $this->assertStringContainsString('404', $result->errorMessage); $this->assertNotNull($result->finalUrl); } /** Verifies that a faked 503 response yields a FetchResult with outcome Blocked5xx, status code 503, a non-null final URL, and an error message containing "503". */ public function test_5xx_response_returns_blocked_5xx(): void { Http::fake([ 'example.com/*' => Http::response('Service Unavailable', 503), ]); $result = $this->makeAction()('https://example.com/page'); $this->assertInstanceOf(FetchResult::class, $result); $this->assertSame(CrawlOutcomeEnum::Blocked5xx, $result->outcome); $this->assertSame(503, $result->statusCode); $this->assertIsString($result->errorMessage); $this->assertStringContainsString('503', $result->errorMessage); $this->assertNotNull($result->finalUrl); } /** Verifies that a faked 200 response served as application/pdf is Rejected: the error message names the content type, the final URL is set, and title, extracted text and word count are null with no outbound links. */ public function test_non_html_content_type_returns_rejected(): void { Http::fake([ 'example.com/*' => Http::response( 'PDF binary stuff', 200, ['Content-Type' => 'application/pdf'], ), ]); $result = $this->makeAction()('https://example.com/document.pdf'); $this->assertInstanceOf(FetchResult::class, $result); $this->assertSame(CrawlOutcomeEnum::Rejected, $result->outcome); $this->assertSame(200, $result->statusCode); $this->assertIsString($result->errorMessage); $this->assertStringContainsString('application/pdf', $result->errorMessage); $this->assertNotNull($result->finalUrl); $this->assertNull($result->title); 
$this->assertNull($result->extractedText); $this->assertEmpty($result->outboundLinks); $this->assertNull($result->wordCount); } /** Verifies that the header value "text/html; charset=utf-8" is accepted as HTML: outcome Success with status code 200. */ public function test_text_html_with_charset_is_accepted(): void { Http::fake([ 'example.com/*' => Http::response( 'Hello charset world', 200, ['Content-Type' => 'text/html; charset=utf-8'], ), ]); $result = $this->makeAction()('https://example.com/page'); $this->assertInstanceOf(FetchResult::class, $result); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); $this->assertSame(200, $result->statusCode); } /** Verifies that a ConnectException with errno 6 ("Could not resolve host") thrown by the HTTP fake yields outcome Failed, with null status code and final URL, a string error message, and no title, text, links or word count. */ public function test_connection_failure_returns_failed(): void { Http::fake(function () { throw new ConnectException( 'Could not resolve host', new Request('GET', 'https://example.com/page'), null, ['errno' => 6], ); }); $result = $this->makeAction()('https://example.com/page'); $this->assertInstanceOf(FetchResult::class, $result); $this->assertSame(CrawlOutcomeEnum::Failed, $result->outcome); $this->assertNull($result->statusCode); $this->assertNull($result->finalUrl); $this->assertIsString($result->errorMessage); $this->assertNull($result->title); $this->assertNull($result->extractedText); $this->assertEmpty($result->outboundLinks); $this->assertNull($result->wordCount); } /** Verifies that a ConnectException with errno 28 (cURL operation timed out) thrown by the HTTP fake yields outcome Timeout, with null status code and final URL and a string error message. */ public function test_timeout_returns_timeout(): void { Http::fake(function () { throw new ConnectException( 'cURL error 28: Operation timed out', new Request('GET', 'https://example.com/page'), null, ['errno' => 28], ); }); $result = $this->makeAction()('https://example.com/page'); $this->assertInstanceOf(FetchResult::class, $result); $this->assertSame(CrawlOutcomeEnum::Timeout, $result->outcome); $this->assertNull($result->statusCode); $this->assertNull($result->finalUrl); $this->assertIsString($result->errorMessage); } /** Verifies that a successful fetch reports "My Page Title" as the result title. NOTE(review): the response body as shown here contains no HTML tags; confirm the fixture still exercises title extraction. */ public function test_success_extracts_title_from_html(): void { Http::fake([ 'example.com/*' => Http::response( 'My Page Title

Some content.

', 200, ['Content-Type' => 'text/html'], ), ]); $result = $this->makeAction()('https://example.com/page'); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); $this->assertSame('My Page Title', $result->title); } /** Verifies that a successful fetch sets extractedText and that it contains the phrase "main article body". NOTE(review): the heredoc fixture as shown here contains no HTML tags; confirm against the intended markup. */ public function test_success_extracts_main_text(): void { $html = <<<'HTML' Article Title

The Real Article

This is the main article body that should be extracted by readability.

Multiple paragraphs prove the extractor works on the full content.

HTML; Http::fake([ 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), ]); $result = $this->makeAction()('https://example.com/article'); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); $this->assertNotNull($result->extractedText); $this->assertStringContainsString('main article body', $result->extractedText); } /** Verifies that outboundLinks is a Collection holding exactly two URLs (the other.com article and the example.com related post) and excludes the private-IP, credential-bearing and ftp URLs. NOTE(review): no anchor tags are visible in the fixture as shown; confirm the hrefs are present. */ public function test_success_extracts_and_filters_outbound_links(): void { $html = <<<'HTML' Article With Links

Article Title

This article references an external article.

And a relative link to a related post on the same site.

Plus a private IP link that should be rejected.

And a credentials URL that should be rejected.

And a non-http scheme that should be rejected.

HTML; Http::fake([ 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), ]); $result = $this->makeAction()('https://example.com/article'); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); $this->assertInstanceOf(Collection::class, $result->outboundLinks); $this->assertSame(2, $result->outboundLinks->count()); $this->assertContains('https://other.com/article', $result->outboundLinks->all()); $this->assertContains('https://example.com/related-post', $result->outboundLinks->all()); $this->assertNotContains('http://192.168.1.1/admin', $result->outboundLinks->all()); $this->assertNotContains('https://user:pass@evil.com/', $result->outboundLinks->all()); $this->assertNotContains('ftp://files.example.com/', $result->outboundLinks->all()); } /** Verifies that wordCount is 9, matching the nine-word body sentence in the fixture. NOTE(review): the fixture also shows a separate heading line, which presumably is not counted; confirm. */ public function test_success_calculates_word_count(): void { $html = <<<'HTML' Word Count Test

This article body has exactly nine words total here.

HTML; Http::fake([ 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), ]); $result = $this->makeAction()('https://example.com/article'); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); $this->assertSame(9, $result->wordCount); } /** Verifies that the mixed-case header value "Text/HTML; charset=utf-8" still results in outcome Success. */ public function test_uppercase_content_type_is_accepted_as_html(): void { Http::fake([ 'example.com/*' => Http::response( 'Uppercase CT

Content here.

', 200, ['Content-Type' => 'Text/HTML; charset=utf-8'], ), ]); $result = $this->makeAction()('https://example.com/page'); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); } /** Verifies that outboundLinks is empty for the empty-href fixture. NOTE(review): the anchor itself is not visible in the fixture as shown; confirm it is present. */ public function test_empty_href_is_filtered_from_outbound_links(): void { $html = <<<'HTML' Empty Href Test

This paragraph has an empty href anchor that should be dropped.

HTML; Http::fake([ 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), ]); $result = $this->makeAction()('https://example.com/article'); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); $this->assertSame(0, $result->outboundLinks->count()); } /** Verifies that outboundLinks is empty for the fragment-only-href fixture. NOTE(review): the anchor itself is not visible in the fixture as shown; confirm it is present. */ public function test_fragment_only_href_is_filtered_from_outbound_links(): void { $html = <<<'HTML' Fragment Href Test

Jump to section 2 of this page.

HTML; Http::fake([ 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), ]); $result = $this->makeAction()('https://example.com/article'); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); $this->assertSame(0, $result->outboundLinks->count()); } /** Builds the FetchPageAction under test by resolving it through app(). */ private function makeAction(): FetchPageAction { return app(FetchPageAction::class); } }