diff --git a/app/ValueObjects/FetchResult.php b/app/ValueObjects/FetchResult.php index e8339dc..86e04bb 100644 --- a/app/ValueObjects/FetchResult.php +++ b/app/ValueObjects/FetchResult.php @@ -9,14 +9,17 @@ class FetchResult { + /** statusCode, finalUrl, title, extractedText and wordCount accept null; FetchResultTest builds a CrawlOutcomeEnum::Failed result with all five null. + * @param Collection $outboundLinks NOTE(review): this tag only restates the native type; the element type is not visible in this diff — TODO confirm and document it. + */ public function __construct( public CrawlOutcomeEnum $outcome, - public int $statusCode, - public string $finalUrl, - public string $title, - public string $extractedText, + public ?int $statusCode, + public ?string $finalUrl, + public ?string $title, + public ?string $extractedText, public Collection $outboundLinks, - public int $wordCount, + public ?int $wordCount, public ?string $errorMessage, ) {} } diff --git a/config/crawler.php b/config/crawler.php new file mode 100644 index 0000000..0c9caeb --- /dev/null +++ b/config/crawler.php @@ -0,0 +1,44 @@ + env('CRAWLER_TIMEOUT', 10), + + /* + |--------------------------------------------------------------------------- + | Maximum redirects to follow + |--------------------------------------------------------------------------- + | + | Guzzle default is 5. Cross-origin redirects are accepted in v0.1 — the + | search engine treats the post-redirect URL as the canonical one for + | indexing. NOTE(review): env() returns a string when the variable is set in the environment; confirm consumers cast to int. + | + */ + + 'max_redirects' => env('CRAWLER_MAX_REDIRECTS', 5), + + /* + |--------------------------------------------------------------------------- + | User-Agent + |--------------------------------------------------------------------------- + | + | Identifies our crawler to target servers. The placeholder below is for + | v0.1 development; ticket #10 replaces it with the production identity + | and adds a `/bot` info page that the URL points at. 
+ | + */ + + 'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'), +]; diff --git a/tests/Unit/ValueObjects/FetchResultTest.php b/tests/Unit/ValueObjects/FetchResultTest.php index 852ea78..c3185f8 100644 --- a/tests/Unit/ValueObjects/FetchResultTest.php +++ b/tests/Unit/ValueObjects/FetchResultTest.php @@ -34,4 +34,27 @@ public function test_it_exposes_all_fields(): void $this->assertSame(5, $result->wordCount); $this->assertNull($result->errorMessage); } + + public function test_it_accepts_null_for_failure_outcome_fields(): void + { /* Every nullable field is passed as null and must read back as null; outboundLinks is given an empty collection and errorMessage a string. */ + $result = new FetchResult( + outcome: CrawlOutcomeEnum::Failed, + statusCode: null, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'Could not connect', + ); + + $this->assertSame(CrawlOutcomeEnum::Failed, $result->outcome); + $this->assertNull($result->statusCode); + $this->assertNull($result->finalUrl); + $this->assertNull($result->title); + $this->assertNull($result->extractedText); + $this->assertSame([], $result->outboundLinks->all()); + $this->assertNull($result->wordCount); + $this->assertSame('Could not connect', $result->errorMessage); + } }