12 - Make FetchResult fields nullable and add missing crawler config

This commit is contained in:
myrmidex 2026-04-26 16:50:43 +02:00
parent a9f2d689ae
commit bb7906e193
3 changed files with 75 additions and 5 deletions

View file

@ -9,14 +9,17 @@
class FetchResult
{
    /**
     * Builds the result of a single fetch attempt.
     *
     * Everything except the outcome and the link collection is nullable, so a
     * result can still be constructed when the fetch produced no response to
     * read from (for example a failed outcome that only carries an error
     * message).
     *
     * Each promoted property is declared exactly once: PHP rejects a
     * constructor that names the same parameter twice.
     *
     * NOTE(review): the per-field meanings below are inferred from the
     * parameter names; confirm against the code that populates this object.
     *
     * @param CrawlOutcomeEnum        $outcome       Outcome classification of the attempt.
     * @param int|null                $statusCode    Presumably the HTTP status code; null when none was received.
     * @param string|null             $finalUrl      Presumably the URL after redirects; null when unknown.
     * @param string|null             $title         Page title, when one was extracted.
     * @param string|null             $extractedText Extracted page text, when available.
     * @param Collection<int, string> $outboundLinks Links found on the page; may be empty.
     * @param int|null                $wordCount     Word count, when text was extracted.
     * @param string|null             $errorMessage  Error description; null when there is none.
     */
    public function __construct(
        public CrawlOutcomeEnum $outcome,
        public ?int $statusCode,
        public ?string $finalUrl,
        public ?string $title,
        public ?string $extractedText,
        public Collection $outboundLinks,
        public ?int $wordCount,
        public ?string $errorMessage,
    ) {}
}

44
config/crawler.php Normal file
View file

@ -0,0 +1,44 @@
<?php

declare(strict_types=1);

// Crawler settings. Numeric values are cast to int because env() hands back
// strings for numeric overrides; without the cast the config type would differ
// depending on whether the environment variable is set.
return [
    /*
    |---------------------------------------------------------------------------
    | HTTP timeout (seconds)
    |---------------------------------------------------------------------------
    |
    | Hard cap on a single fetch. Guzzle's default is 0 (wait forever), which
    | is never acceptable for a crawler. Tune up cautiously; longer timeouts
    | amplify the impact of slow targets on overall throughput.
    |
    | Whole seconds only: the value is cast to int.
    |
    */
    'timeout' => (int) env('CRAWLER_TIMEOUT', 10),

    /*
    |---------------------------------------------------------------------------
    | Maximum redirects to follow
    |---------------------------------------------------------------------------
    |
    | Guzzle default is 5. Cross-origin redirects are accepted in v0.1; the
    | search engine treats the post-redirect URL as the canonical one for
    | indexing.
    |
    */
    'max_redirects' => (int) env('CRAWLER_MAX_REDIRECTS', 5),

    /*
    |---------------------------------------------------------------------------
    | User-Agent
    |---------------------------------------------------------------------------
    |
    | Identifies our crawler to target servers. The placeholder below is for
    | v0.1 development; ticket #10 replaces it with the production identity
    | and adds a `/bot` info page that the URL points at.
    |
    */
    'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'),
];

View file

@ -34,4 +34,27 @@ public function test_it_exposes_all_fields(): void
$this->assertSame(5, $result->wordCount);
$this->assertNull($result->errorMessage);
}
/**
 * A failed fetch may leave every response-derived field null while still
 * carrying an error message and an empty link collection.
 */
public function test_it_accepts_null_for_failure_outcome_fields(): void
{
    // Arrange / act: build a result the way a failed fetch would.
    $failureMessage = 'Could not connect';
    $noLinks = collect();

    $result = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: $noLinks,
        wordCount: null,
        errorMessage: $failureMessage,
    );

    // Assert: each field comes back exactly as it was passed in.
    self::assertSame(CrawlOutcomeEnum::Failed, $result->outcome);
    self::assertNull($result->statusCode);
    self::assertNull($result->finalUrl);
    self::assertNull($result->title);
    self::assertNull($result->extractedText);
    self::assertSame([], $result->outboundLinks->all());
    self::assertNull($result->wordCount);
    self::assertSame($failureMessage, $result->errorMessage);
}
}