12 - Add FetchPageAction with Http::fake-driven outcome paths
This commit is contained in:
parent
bb7906e193
commit
1b7fbbfd0c
2 changed files with 254 additions and 0 deletions
98
app/Actions/FetchPageAction.php
Normal file
98
app/Actions/FetchPageAction.php
Normal file
|
|
@ -0,0 +1,98 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace App\Actions;
|
||||||
|
|
||||||
|
use App\Enums\CrawlOutcomeEnum;
|
||||||
|
use App\ValueObjects\FetchResult;
|
||||||
|
use GuzzleHttp\Exception\ConnectException;
|
||||||
|
use Illuminate\Http\Client\ConnectionException;
|
||||||
|
use Illuminate\Http\Client\Factory;
|
||||||
|
use Illuminate\Http\Client\Response;
|
||||||
|
|
||||||
|
/**
 * Fetches a single URL with the HTTP client and classifies what happened.
 *
 * The action never parses the body: title, extracted text, word count and
 * outbound links are always returned empty. It only decides which
 * CrawlOutcomeEnum case applies and records the status code, final URL and
 * an error message where relevant.
 */
class FetchPageAction
{
    public function __construct(
        private Factory $http,
    ) {}

    /**
     * Perform a GET request for the given URL and wrap the outcome in a FetchResult.
     *
     * Connection-level exceptions are caught and converted into a failure
     * result; any response that is received (including 4xx/5xx) is classified
     * by validateResponse().
     *
     * @param  string  $url  The URL to request.
     */
    public function __invoke(string $url): FetchResult
    {
        try {
            $response = $this->http
                ->timeout(config('crawler.timeout'))
                ->withHeaders([
                    'User-Agent' => config('crawler.user_agent'),
                    'Accept' => 'text/html',
                ])
                ->withOptions([
                    'allow_redirects' => ['max' => config('crawler.max_redirects')],
                ])
                ->get($url);
        } catch (ConnectionException|ConnectException $e) {
            return $this->failureResult($e);
        }

        [$outcome, $error] = $this->validateResponse($response);

        // Redirects are followed, so the URL that actually served the response
        // may differ from the requested one. effectiveUri() is null when no
        // transfer stats were recorded (e.g. faked responses); fall back to
        // the requested URL in that case.
        $finalUrl = (string) ($response->effectiveUri() ?? $url);

        return new FetchResult(
            outcome: $outcome,
            statusCode: $response->status(),
            finalUrl: $finalUrl,
            title: null,
            extractedText: null,
            outboundLinks: collect(),
            wordCount: null,
            errorMessage: $error,
        );
    }

    /**
     * Classify a received response by status code and content type.
     *
     * @return array{0: CrawlOutcomeEnum, 1: string|null}  The outcome and, for
     *                                                     non-success outcomes, an error message.
     */
    private function validateResponse(Response $response): array
    {
        $status = $response->status();

        if ($status >= 400 && $status < 500) {
            return [CrawlOutcomeEnum::Blocked4xx, "HTTP {$status}"];
        }

        if ($status >= 500 && $status < 600) {
            return [CrawlOutcomeEnum::Blocked5xx, "HTTP {$status}"];
        }

        $contentType = $response->header('Content-Type');

        // Media types are case-insensitive, so compare in lower case. A prefix
        // match is used so parameters such as "; charset=utf-8" are accepted.
        // The original header value is kept in the message for diagnostics.
        if (! str_starts_with(strtolower($contentType), 'text/html')) {
            return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"];
        }

        return [CrawlOutcomeEnum::Success, null];
    }

    /**
     * Build the result for a request that failed before any response arrived.
     *
     * The Guzzle ConnectException is looked for either as the caught exception
     * itself or as its previous exception. Its handler context "errno" decides
     * between Timeout (CURLE_OPERATION_TIMEDOUT) and Failed; when no errno is
     * available the outcome is Failed.
     */
    private function failureResult(ConnectionException|ConnectException $e): FetchResult
    {
        $previous = $e->getPrevious();

        $guzzleException = match (true) {
            $e instanceof ConnectException => $e,
            $previous instanceof ConnectException => $previous,
            default => null,
        };

        $errno = $guzzleException?->getHandlerContext()['errno'] ?? null;

        $outcome = $errno === CURLE_OPERATION_TIMEDOUT
            ? CrawlOutcomeEnum::Timeout
            : CrawlOutcomeEnum::Failed;

        return new FetchResult(
            outcome: $outcome,
            statusCode: null,
            finalUrl: null,
            title: null,
            extractedText: null,
            outboundLinks: collect(),
            wordCount: null,
            errorMessage: $e->getMessage(),
        );
    }
}
|
||||||
156
tests/Feature/Actions/FetchPageActionTest.php
Normal file
156
tests/Feature/Actions/FetchPageActionTest.php
Normal file
|
|
@ -0,0 +1,156 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace Tests\Feature\Actions;
|
||||||
|
|
||||||
|
use App\Actions\FetchPageAction;
|
||||||
|
use App\Enums\CrawlOutcomeEnum;
|
||||||
|
use App\ValueObjects\FetchResult;
|
||||||
|
use GuzzleHttp\Exception\ConnectException;
|
||||||
|
use GuzzleHttp\Psr7\Request;
|
||||||
|
use Illuminate\Support\Facades\Http;
|
||||||
|
use Tests\TestCase;
|
||||||
|
|
||||||
|
/**
 * Exercises every outcome path of FetchPageAction using Http::fake, so no
 * real network traffic is involved.
 */
class FetchPageActionTest extends TestCase
{
    public function test_successful_html_fetch_returns_success_outcome(): void
    {
        $this->fakeExampleResponse(
            '<html><body>Hello</body></html>',
            200,
            ['Content-Type' => 'text/html'],
        );

        $fetched = $this->fetch('https://example.com/page');

        self::assertInstanceOf(FetchResult::class, $fetched);
        self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        self::assertSame(200, $fetched->statusCode);
        self::assertNotNull($fetched->finalUrl);
    }

    public function test_4xx_response_returns_blocked_4xx(): void
    {
        $this->fakeExampleResponse('Not Found', 404);

        $fetched = $this->fetch('https://example.com/missing');

        self::assertInstanceOf(FetchResult::class, $fetched);
        self::assertSame(CrawlOutcomeEnum::Blocked4xx, $fetched->outcome);
        self::assertSame(404, $fetched->statusCode);
        self::assertIsString($fetched->errorMessage);
        self::assertStringContainsString('404', $fetched->errorMessage);
        self::assertNotNull($fetched->finalUrl);
    }

    public function test_5xx_response_returns_blocked_5xx(): void
    {
        $this->fakeExampleResponse('Service Unavailable', 503);

        $fetched = $this->fetch('https://example.com/page');

        self::assertInstanceOf(FetchResult::class, $fetched);
        self::assertSame(CrawlOutcomeEnum::Blocked5xx, $fetched->outcome);
        self::assertSame(503, $fetched->statusCode);
        self::assertIsString($fetched->errorMessage);
        self::assertStringContainsString('503', $fetched->errorMessage);
        self::assertNotNull($fetched->finalUrl);
    }

    public function test_non_html_content_type_returns_rejected(): void
    {
        $this->fakeExampleResponse(
            'PDF binary stuff',
            200,
            ['Content-Type' => 'application/pdf'],
        );

        $fetched = $this->fetch('https://example.com/document.pdf');

        self::assertInstanceOf(FetchResult::class, $fetched);
        self::assertSame(CrawlOutcomeEnum::Rejected, $fetched->outcome);
        self::assertSame(200, $fetched->statusCode);
        self::assertIsString($fetched->errorMessage);
        self::assertStringContainsString('application/pdf', $fetched->errorMessage);
        self::assertNotNull($fetched->finalUrl);
        self::assertNull($fetched->title);
        self::assertNull($fetched->extractedText);
        self::assertEmpty($fetched->outboundLinks);
        self::assertNull($fetched->wordCount);
    }

    public function test_text_html_with_charset_is_accepted(): void
    {
        $this->fakeExampleResponse(
            '<html><body>Hello charset world</body></html>',
            200,
            ['Content-Type' => 'text/html; charset=utf-8'],
        );

        $fetched = $this->fetch('https://example.com/page');

        self::assertInstanceOf(FetchResult::class, $fetched);
        self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        self::assertSame(200, $fetched->statusCode);
    }

    public function test_connection_failure_returns_failed(): void
    {
        // errno 6 is not the timeout code, so the action should report Failed.
        $this->fakeConnectException('Could not resolve host', 6);

        $fetched = $this->fetch('https://example.com/page');

        self::assertInstanceOf(FetchResult::class, $fetched);
        self::assertSame(CrawlOutcomeEnum::Failed, $fetched->outcome);
        self::assertNull($fetched->statusCode);
        self::assertNull($fetched->finalUrl);
        self::assertIsString($fetched->errorMessage);
        self::assertNull($fetched->title);
        self::assertNull($fetched->extractedText);
        self::assertEmpty($fetched->outboundLinks);
        self::assertNull($fetched->wordCount);
    }

    public function test_timeout_returns_timeout(): void
    {
        // errno 28 is the code the action maps to the Timeout outcome.
        $this->fakeConnectException('cURL error 28: Operation timed out', 28);

        $fetched = $this->fetch('https://example.com/page');

        self::assertInstanceOf(FetchResult::class, $fetched);
        self::assertSame(CrawlOutcomeEnum::Timeout, $fetched->outcome);
        self::assertNull($fetched->statusCode);
        self::assertNull($fetched->finalUrl);
        self::assertIsString($fetched->errorMessage);
    }

    /**
     * Resolve the action under test from the container.
     */
    private function makeAction(): FetchPageAction
    {
        return app(FetchPageAction::class);
    }

    /**
     * Invoke a freshly resolved action for the given URL.
     */
    private function fetch(string $url): FetchResult
    {
        $action = $this->makeAction();

        return $action($url);
    }

    /**
     * Register a canned response for every request under example.com.
     *
     * @param  array<string, string>  $headers
     */
    private function fakeExampleResponse(string $body, int $status, array $headers = []): void
    {
        Http::fake([
            'example.com/*' => Http::response($body, $status, $headers),
        ]);
    }

    /**
     * Make every faked request throw a Guzzle ConnectException carrying the
     * given message and handler-context errno.
     */
    private function fakeConnectException(string $message, int $errno): void
    {
        Http::fake(function () use ($message, $errno) {
            throw new ConnectException(
                $message,
                new Request('GET', 'https://example.com/page'),
                null,
                ['errno' => $errno],
            );
        });
    }
}
|
||||||
Loading…
Reference in a new issue