12 - Add FetchPageAction with Http::fake-driven outcome paths

This commit is contained in:
myrmidex 2026-04-26 17:56:13 +02:00
parent bb7906e193
commit 1b7fbbfd0c
2 changed files with 254 additions and 0 deletions

View file

@ -0,0 +1,98 @@
<?php
declare(strict_types=1);
namespace App\Actions;
use App\Enums\CrawlOutcomeEnum;
use App\ValueObjects\FetchResult;
use GuzzleHttp\Exception\ConnectException;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\Factory;
use Illuminate\Http\Client\Response;
/**
 * Fetches a single URL over HTTP and classifies the outcome of the exchange.
 *
 * Only the transport-level result is evaluated here: the title, extracted text,
 * outbound links and word count of the returned FetchResult are always empty.
 */
class FetchPageAction
{
    public function __construct(
        private Factory $http,
    ) {}

    /**
     * Issue a GET request for the given URL and describe what happened.
     *
     * The timeout, User-Agent header and redirect limit are read from the
     * `crawler.*` configuration keys.
     *
     * @param string $url URL to request.
     *
     * @return FetchResult Result carrying the outcome, the HTTP status (null when
     *                     no response was received), the final URL and an error
     *                     message for every non-success outcome.
     */
    public function __invoke(string $url): FetchResult
    {
        try {
            $response = $this->http
                ->timeout(config('crawler.timeout'))
                ->withHeaders([
                    'User-Agent' => config('crawler.user_agent'),
                    'Accept' => 'text/html',
                ])
                ->withOptions([
                    'allow_redirects' => ['max' => config('crawler.max_redirects')],
                ])
                ->get($url);
        } catch (ConnectionException|ConnectException $e) {
            return $this->failureResult($e);
        }

        [$outcome, $error] = $this->validateResponse($response);

        // Redirects are followed, so the URL that was actually served may differ
        // from the requested one. effectiveUri() is null when no transfer stats
        // were recorded for the request; fall back to the requested URL then.
        $effectiveUri = $response->effectiveUri();

        return new FetchResult(
            outcome: $outcome,
            statusCode: $response->status(),
            finalUrl: $effectiveUri !== null ? (string) $effectiveUri : $url,
            title: null,
            extractedText: null,
            outboundLinks: collect(),
            wordCount: null,
            errorMessage: $error,
        );
    }

    /**
     * Classify a received response by status class and content type.
     *
     * @return array{0: CrawlOutcomeEnum, 1: string|null} Outcome and error message
     *                                                    (null on success).
     */
    private function validateResponse(Response $response): array
    {
        $status = $response->status();

        // Dispatch on the status class (4xx / 5xx) with a single integer check.
        $blockedOutcome = match (intdiv($status, 100)) {
            4 => CrawlOutcomeEnum::Blocked4xx,
            5 => CrawlOutcomeEnum::Blocked5xx,
            default => null,
        };

        if ($blockedOutcome !== null) {
            return [$blockedOutcome, "HTTP {$status}"];
        }

        $contentType = $response->header('Content-Type');

        // Media types are case-insensitive, so normalise before the prefix check;
        // parameters such as "; charset=utf-8" are tolerated by matching the prefix.
        if (! str_starts_with(strtolower(ltrim($contentType)), 'text/html')) {
            return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"];
        }

        return [CrawlOutcomeEnum::Success, null];
    }

    /**
     * Build the result for a request that never produced a response.
     *
     * The outcome is Timeout when the underlying Guzzle exception reports the
     * cURL "operation timed out" errno, and Failed in every other case.
     */
    private function failureResult(ConnectionException|ConnectException $e): FetchResult
    {
        // The Guzzle exception is either the caught exception itself or the
        // "previous" exception attached to it.
        $guzzleException = $e instanceof ConnectException ? $e : $e->getPrevious();

        $errno = $guzzleException instanceof ConnectException
            ? ($guzzleException->getHandlerContext()['errno'] ?? null)
            : null;

        $outcome = $errno === CURLE_OPERATION_TIMEDOUT
            ? CrawlOutcomeEnum::Timeout
            : CrawlOutcomeEnum::Failed;

        return new FetchResult(
            outcome: $outcome,
            statusCode: null,
            finalUrl: null,
            title: null,
            extractedText: null,
            outboundLinks: collect(),
            wordCount: null,
            errorMessage: $e->getMessage(),
        );
    }
}

View file

@ -0,0 +1,156 @@
<?php
declare(strict_types=1);
namespace Tests\Feature\Actions;
use App\Actions\FetchPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\ValueObjects\FetchResult;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Psr7\Request;
use Illuminate\Support\Facades\Http;
use Tests\TestCase;
/**
 * Exercises every outcome path of FetchPageAction against faked HTTP traffic.
 */
class FetchPageActionTest extends TestCase
{
    public function test_successful_html_fetch_returns_success_outcome(): void
    {
        $this->fakeExampleResponse(
            '<html><body>Hello</body></html>',
            200,
            ['Content-Type' => 'text/html'],
        );

        $fetched = $this->fetch('https://example.com/page');

        self::assertInstanceOf(FetchResult::class, $fetched);
        self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        self::assertSame(200, $fetched->statusCode);
        self::assertNotNull($fetched->finalUrl);
    }

    public function test_4xx_response_returns_blocked_4xx(): void
    {
        $this->fakeExampleResponse('Not Found', 404);

        $fetched = $this->fetch('https://example.com/missing');

        self::assertInstanceOf(FetchResult::class, $fetched);
        self::assertSame(CrawlOutcomeEnum::Blocked4xx, $fetched->outcome);
        self::assertSame(404, $fetched->statusCode);
        self::assertIsString($fetched->errorMessage);
        self::assertStringContainsString('404', $fetched->errorMessage);
        self::assertNotNull($fetched->finalUrl);
    }

    public function test_5xx_response_returns_blocked_5xx(): void
    {
        $this->fakeExampleResponse('Service Unavailable', 503);

        $fetched = $this->fetch('https://example.com/page');

        self::assertInstanceOf(FetchResult::class, $fetched);
        self::assertSame(CrawlOutcomeEnum::Blocked5xx, $fetched->outcome);
        self::assertSame(503, $fetched->statusCode);
        self::assertIsString($fetched->errorMessage);
        self::assertStringContainsString('503', $fetched->errorMessage);
        self::assertNotNull($fetched->finalUrl);
    }

    public function test_non_html_content_type_returns_rejected(): void
    {
        $this->fakeExampleResponse(
            'PDF binary stuff',
            200,
            ['Content-Type' => 'application/pdf'],
        );

        $fetched = $this->fetch('https://example.com/document.pdf');

        self::assertInstanceOf(FetchResult::class, $fetched);
        self::assertSame(CrawlOutcomeEnum::Rejected, $fetched->outcome);
        self::assertSame(200, $fetched->statusCode);
        self::assertIsString($fetched->errorMessage);
        self::assertStringContainsString('application/pdf', $fetched->errorMessage);
        self::assertNotNull($fetched->finalUrl);
        self::assertNull($fetched->title);
        self::assertNull($fetched->extractedText);
        self::assertEmpty($fetched->outboundLinks);
        self::assertNull($fetched->wordCount);
    }

    public function test_text_html_with_charset_is_accepted(): void
    {
        $this->fakeExampleResponse(
            '<html><body>Hello charset world</body></html>',
            200,
            ['Content-Type' => 'text/html; charset=utf-8'],
        );

        $fetched = $this->fetch('https://example.com/page');

        self::assertInstanceOf(FetchResult::class, $fetched);
        self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        self::assertSame(200, $fetched->statusCode);
    }

    public function test_connection_failure_returns_failed(): void
    {
        $this->fakeConnectException('Could not resolve host', 6);

        $fetched = $this->fetch('https://example.com/page');

        self::assertInstanceOf(FetchResult::class, $fetched);
        self::assertSame(CrawlOutcomeEnum::Failed, $fetched->outcome);
        self::assertNull($fetched->statusCode);
        self::assertNull($fetched->finalUrl);
        self::assertIsString($fetched->errorMessage);
        self::assertNull($fetched->title);
        self::assertNull($fetched->extractedText);
        self::assertEmpty($fetched->outboundLinks);
        self::assertNull($fetched->wordCount);
    }

    public function test_timeout_returns_timeout(): void
    {
        $this->fakeConnectException('cURL error 28: Operation timed out', 28);

        $fetched = $this->fetch('https://example.com/page');

        self::assertInstanceOf(FetchResult::class, $fetched);
        self::assertSame(CrawlOutcomeEnum::Timeout, $fetched->outcome);
        self::assertNull($fetched->statusCode);
        self::assertNull($fetched->finalUrl);
        self::assertIsString($fetched->errorMessage);
    }

    /**
     * Register a canned response for every URL matching example.com/*.
     *
     * @param array<string, string> $headers
     */
    private function fakeExampleResponse(string $body, int $status, array $headers = []): void
    {
        Http::fake([
            'example.com/*' => Http::response($body, $status, $headers),
        ]);
    }

    /**
     * Make every faked request throw a Guzzle ConnectException whose handler
     * context carries the given errno.
     */
    private function fakeConnectException(string $message, int $errno): void
    {
        Http::fake(static function () use ($message, $errno) {
            throw new ConnectException(
                $message,
                new Request('GET', 'https://example.com/page'),
                null,
                ['errno' => $errno],
            );
        });
    }

    /**
     * Resolve the action and invoke it for the given URL.
     *
     * Must be called after the fake has been registered.
     */
    private function fetch(string $url): FetchResult
    {
        $action = $this->makeAction();

        return $action($url);
    }

    /**
     * Resolve the action under test from the application container.
     */
    private function makeAction(): FetchPageAction
    {
        return app(FetchPageAction::class);
    }
}