12 - Make FetchResult fields nullable and add missing crawler config

2026-04-26 16:50:43 +02:00 · 2026-04-26 16:50:43 +02:00 · bb7906e193
commit bb7906e193
parent a9f2d689ae
3 changed files with 75 additions and 5 deletions
--- a/app/ValueObjects/FetchResult.php
+++ b/app/ValueObjects/FetchResult.php
@ -9,14 +9,17 @@

 class FetchResult
 {
+    /**
+     * @param  Collection<int, string>  $outboundLinks  Non-nullable, unlike the scalar result fields; may be empty.
+     */
    public function __construct(
        public CrawlOutcomeEnum $outcome,
-        public int $statusCode,
-        public string $finalUrl,
-        public string $title,
-        public string $extractedText,
+        public ?int $statusCode,
+        public ?string $finalUrl,
+        public ?string $title,
+        public ?string $extractedText,
        public Collection $outboundLinks,
-        public int $wordCount,
+        public ?int $wordCount,
        public ?string $errorMessage,
    ) {}
 }
--- a/config/crawler.php
+++ b/config/crawler.php
@ -0,0 +1,44 @@
+<?php
+
+declare(strict_types=1);
+
+return [
+    /*
+    |---------------------------------------------------------------------------
+    | HTTP timeout (seconds)
+    |---------------------------------------------------------------------------
+    |
+    | Hard cap on a single fetch. Guzzle's default is 0 (wait forever) — never
+    | acceptable for a crawler. Tune up cautiously; longer timeouts amplify the
+    | impact of slow targets on throughput. Cast to int: .env values are strings.
+    |
+    */
+
+    'timeout' => (int) env('CRAWLER_TIMEOUT', 10),
+
+    /*
+    |---------------------------------------------------------------------------
+    | Maximum redirects to follow
+    |---------------------------------------------------------------------------
+    |
+    | Guzzle's default is 5. Cross-origin redirects are accepted in v0.1 — the
+    | search engine treats the post-redirect URL as the canonical one for
+    | indexing. Cast to int: .env values are strings.
+    |
+    */
+
+    'max_redirects' => (int) env('CRAWLER_MAX_REDIRECTS', 5),
+
+    /*
+    |---------------------------------------------------------------------------
+    | User-Agent
+    |---------------------------------------------------------------------------
+    |
+    | Identifies our crawler to target servers. The placeholder below is for
+    | v0.1 development; ticket #10 replaces it with the production identity
+    | and adds a `/bot` info page that the URL points at.
+    |
+    */
+
+    'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'),
+];
--- a/tests/Unit/ValueObjects/FetchResultTest.php
+++ b/tests/Unit/ValueObjects/FetchResultTest.php
@ -34,4 +34,27 @@ public function test_it_exposes_all_fields(): void
        $this->assertSame(5, $result->wordCount);
        $this->assertNull($result->errorMessage);
    }
+
+    public function test_it_accepts_null_for_failure_outcome_fields(): void
+    {
+        $failedFetch = new FetchResult(
+            outcome: CrawlOutcomeEnum::Failed,
+            statusCode: null,
+            finalUrl: null,
+            title: null,
+            extractedText: null,
+            outboundLinks: collect(),
+            wordCount: null,
+            errorMessage: 'Could not connect',
+        );
+        // outcome, errorMessage and outboundLinks are populated; the remaining fields are null.
+        $this->assertSame(CrawlOutcomeEnum::Failed, $failedFetch->outcome);
+        $this->assertSame('Could not connect', $failedFetch->errorMessage);
+        $this->assertSame([], $failedFetch->outboundLinks->all());
+        $this->assertNull($failedFetch->statusCode);
+        $this->assertNull($failedFetch->finalUrl);
+        $this->assertNull($failedFetch->title);
+        $this->assertNull($failedFetch->extractedText);
+        $this->assertNull($failedFetch->wordCount);
+    }
 }