From 1b7fbbfd0c9605a24d325de9b563dda2a6bc556d Mon Sep 17 00:00:00 2001
From: myrmidex
Date: Sun, 26 Apr 2026 17:56:13 +0200
Subject: [PATCH] 12 - Add FetchPageAction with Http::fake-driven outcome paths

---
 app/Actions/FetchPageAction.php               | 118 +++++++++++
 tests/Feature/Actions/FetchPageActionTest.php | 174 ++++++++++++++++++
 2 files changed, 292 insertions(+)
 create mode 100644 app/Actions/FetchPageAction.php
 create mode 100644 tests/Feature/Actions/FetchPageActionTest.php

diff --git a/app/Actions/FetchPageAction.php b/app/Actions/FetchPageAction.php
new file mode 100644
index 0000000..9a69018
--- /dev/null
+++ b/app/Actions/FetchPageAction.php
@@ -0,0 +1,118 @@
+http
+                ->timeout(config('crawler.timeout'))
+                ->withHeaders([
+                    'User-Agent' => config('crawler.user_agent'),
+                    'Accept' => 'text/html',
+                ])
+                ->withOptions([
+                    'allow_redirects' => ['max' => config('crawler.max_redirects')],
+                ])
+                ->get($url);
+
+        } catch (ConnectionException|ConnectException $e) {
+            return $this->failureResult($e);
+        }
+
+        [$outcome, $error] = $this->validateResponse($response);
+
+        return new FetchResult(
+            outcome: $outcome,
+            statusCode: $response->status(),
+            finalUrl: $url,
+            title: null,
+            extractedText: null,
+            outboundLinks: collect(),
+            wordCount: null,
+            errorMessage: $error ?? null,
+        );
+    }
+
+    /**
+     * Classify a received response into a crawl outcome.
+     *
+     * 4xx and 5xx statuses are reported as blocked. Any other response must
+     * declare a text/html Content-Type; the media type is compared
+     * case-insensitively because media types are case-insensitive (RFC 9110).
+     *
+     * @return array{0: CrawlOutcomeEnum, 1: string|null} Outcome and error message (null on success).
+     */
+    private function validateResponse(Response $response): array
+    {
+        $status = $response->status();
+
+        // The hundreds digit identifies the status class.
+        $blockedOutcome = match (intdiv($status, 100)) {
+            4 => CrawlOutcomeEnum::Blocked4xx,
+            5 => CrawlOutcomeEnum::Blocked5xx,
+            default => null,
+        };
+
+        if ($blockedOutcome !== null) {
+            return [$blockedOutcome, "HTTP {$status}"];
+        }
+
+        $contentType = $response->header('Content-Type');
+
+        // Normalise case so "Text/HTML; Charset=UTF-8" is accepted as well.
+        if (! str_starts_with(strtolower($contentType), 'text/html')) {
+            return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"];
+        }
+
+        return [CrawlOutcomeEnum::Success, null];
+    }
+
+    /**
+     * Build the result for a request that failed before a response arrived.
+     *
+     * The outcome is Timeout when the Guzzle ConnectException (either $e itself
+     * or the exception it wraps) reports the cURL "operation timed out" errno
+     * in its handler context; every other connection error is Failed.
+     *
+     * @param ConnectionException|ConnectException $e Exception caught while sending the request.
+     */
+    private function failureResult(ConnectionException|ConnectException $e): FetchResult
+    {
+        $guzzleException = match (true) {
+            $e instanceof ConnectException => $e,
+            $e->getPrevious() instanceof ConnectException => $e->getPrevious(),
+            default => null,
+        };
+
+        $errno = $guzzleException?->getHandlerContext()['errno'] ?? null;
+
+        return new FetchResult(
+            outcome: $errno === CURLE_OPERATION_TIMEDOUT
+                ? CrawlOutcomeEnum::Timeout
+                : CrawlOutcomeEnum::Failed,
+            statusCode: null,
+            finalUrl: null,
+            title: null,
+            extractedText: null,
+            outboundLinks: collect(),
+            wordCount: null,
+            errorMessage: $e->getMessage(),
+        );
+    }
+}
diff --git a/tests/Feature/Actions/FetchPageActionTest.php b/tests/Feature/Actions/FetchPageActionTest.php
new file mode 100644
index 0000000..1b399a8
--- /dev/null
+++ b/tests/Feature/Actions/FetchPageActionTest.php
@@ -0,0 +1,174 @@
+ Http::response(
+                'Hello',
+                200,
+                ['Content-Type' => 'text/html'],
+            ),
+        ]);
+
+        $result = $this->makeAction()('https://example.com/page');
+
+        $this->assertInstanceOf(FetchResult::class, $result);
+        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
+        $this->assertSame(200, $result->statusCode);
+        $this->assertNotNull($result->finalUrl);
+    }
+
+    public function test_4xx_response_returns_blocked_4xx(): void
+    {
+        Http::fake([
+            'example.com/*' => Http::response('Not Found', 404),
+        ]);
+
+        $result = $this->makeAction()('https://example.com/missing');
+
+        $this->assertInstanceOf(FetchResult::class, $result);
+        $this->assertSame(CrawlOutcomeEnum::Blocked4xx, $result->outcome);
+        $this->assertSame(404, $result->statusCode);
+        $this->assertIsString($result->errorMessage);
+        $this->assertStringContainsString('404', $result->errorMessage);
+        $this->assertNotNull($result->finalUrl);
+    }
+
+    public function test_5xx_response_returns_blocked_5xx(): void
+    {
+        Http::fake([
+            'example.com/*' => Http::response('Service Unavailable', 503),
+        ]);
+
+        $result = $this->makeAction()('https://example.com/page');
+
+        $this->assertInstanceOf(FetchResult::class, $result);
+        $this->assertSame(CrawlOutcomeEnum::Blocked5xx, $result->outcome);
+        $this->assertSame(503, $result->statusCode);
+        $this->assertIsString($result->errorMessage);
+        $this->assertStringContainsString('503', $result->errorMessage);
+        $this->assertNotNull($result->finalUrl);
+    }
+
+    public function test_non_html_content_type_returns_rejected(): void
+    {
+        Http::fake([
+            'example.com/*' => Http::response(
+                'PDF binary stuff',
+                200,
+                ['Content-Type' => 'application/pdf'],
+            ),
+        ]);
+
+        $result = $this->makeAction()('https://example.com/document.pdf');
+
+        $this->assertInstanceOf(FetchResult::class, $result);
+        $this->assertSame(CrawlOutcomeEnum::Rejected, $result->outcome);
+        $this->assertSame(200, $result->statusCode);
+        $this->assertIsString($result->errorMessage);
+        $this->assertStringContainsString('application/pdf', $result->errorMessage);
+        $this->assertNotNull($result->finalUrl);
+        $this->assertNull($result->title);
+        $this->assertNull($result->extractedText);
+        $this->assertEmpty($result->outboundLinks);
+        $this->assertNull($result->wordCount);
+    }
+
+    public function test_text_html_with_charset_is_accepted(): void
+    {
+        Http::fake([
+            'example.com/*' => Http::response(
+                'Hello charset world',
+                200,
+                ['Content-Type' => 'text/html; charset=utf-8'],
+            ),
+        ]);
+
+        $result = $this->makeAction()('https://example.com/page');
+
+        $this->assertInstanceOf(FetchResult::class, $result);
+        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
+        $this->assertSame(200, $result->statusCode);
+    }
+
+    public function test_content_type_match_is_case_insensitive(): void
+    {
+        Http::fake([
+            'example.com/*' => Http::response(
+                'Hello mixed case',
+                200,
+                ['Content-Type' => 'Text/HTML; Charset=UTF-8'],
+            ),
+        ]);
+
+        $result = $this->makeAction()('https://example.com/page');
+
+        $this->assertInstanceOf(FetchResult::class, $result);
+        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
+        $this->assertSame(200, $result->statusCode);
+    }
+
+    public function test_connection_failure_returns_failed(): void
+    {
+        Http::fake(function () {
+            throw new ConnectException(
+                'Could not resolve host',
+                new Request('GET', 'https://example.com/page'),
+                null,
+                ['errno' => 6],
+            );
+        });
+
+        $result = $this->makeAction()('https://example.com/page');
+
+        $this->assertInstanceOf(FetchResult::class, $result);
+        $this->assertSame(CrawlOutcomeEnum::Failed, $result->outcome);
+        $this->assertNull($result->statusCode);
+        $this->assertNull($result->finalUrl);
+        $this->assertIsString($result->errorMessage);
+        $this->assertNull($result->title);
+        $this->assertNull($result->extractedText);
+        $this->assertEmpty($result->outboundLinks);
+        $this->assertNull($result->wordCount);
+    }
+
+    public function test_timeout_returns_timeout(): void
+    {
+        Http::fake(function () {
+            throw new ConnectException(
+                'cURL error 28: Operation timed out',
+                new Request('GET', 'https://example.com/page'),
+                null,
+                ['errno' => 28],
+            );
+        });
+
+        $result = $this->makeAction()('https://example.com/page');
+
+        $this->assertInstanceOf(FetchResult::class, $result);
+        $this->assertSame(CrawlOutcomeEnum::Timeout, $result->outcome);
+        $this->assertNull($result->statusCode);
+        $this->assertNull($result->finalUrl);
+        $this->assertIsString($result->errorMessage);
+    }
+
+    private function makeAction(): FetchPageAction
+    {
+        return app(FetchPageAction::class);
+    }
+}