12 - Make FetchResult fields nullable and add missing crawler config
This commit is contained in:
parent
a9f2d689ae
commit
bb7906e193
3 changed files with 75 additions and 5 deletions
|
|
@ -9,14 +9,17 @@
|
|||
|
||||
/**
 * Immutable-by-convention data holder for the result of fetching one URL.
 *
 * All response-derived fields are nullable so that a result can still be
 * constructed when there is no response to describe — e.g. a fetch that
 * failed to connect carries only an outcome and an error message.
 */
class FetchResult
{
    /**
     * @param CrawlOutcomeEnum        $outcome       Classification of how the fetch ended.
     * @param int|null                $statusCode    Response status code, or null when none was received.
     * @param string|null             $finalUrl      URL the fetch ended on, or null when unavailable.
     * @param string|null             $title         Page title, or null when unavailable.
     * @param string|null             $extractedText Text extracted from the page, or null when unavailable.
     * @param Collection<int, string> $outboundLinks Links found on the page; may be empty.
     * @param int|null                $wordCount     Word count, or null when unavailable
     *                                               (presumably counted over $extractedText — confirm with the producer).
     * @param string|null             $errorMessage  Failure description, or null when there is nothing to report.
     */
    public function __construct(
        public CrawlOutcomeEnum $outcome,
        public ?int $statusCode,
        public ?string $finalUrl,
        public ?string $title,
        public ?string $extractedText,
        public Collection $outboundLinks,
        public ?int $wordCount,
        public ?string $errorMessage,
    ) {}
}
|
||||
|
|
|
|||
44
config/crawler.php
Normal file
44
config/crawler.php
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
return [
    /*
    |---------------------------------------------------------------------------
    | HTTP timeout (seconds)
    |---------------------------------------------------------------------------
    |
    | Hard cap on a single fetch. Guzzle's default is 0 (wait forever) — never
    | acceptable for a crawler. Tune up cautiously; longer timeouts amplify the
    | impact of slow targets on overall throughput.
    |
    | env() yields a string when the variable is set in the environment, so the
    | value is cast to keep the type stable for consumers. Whole seconds only:
    | a fractional value is truncated by the cast.
    |
    */

    'timeout' => (int) env('CRAWLER_TIMEOUT', 10),

    /*
    |---------------------------------------------------------------------------
    | Maximum redirects to follow
    |---------------------------------------------------------------------------
    |
    | Guzzle default is 5. Cross-origin redirects are accepted in v0.1 — the
    | search engine treats the post-redirect URL as the canonical one for
    | indexing.
    |
    | Cast to int for the same reason as the timeout: env() returns strings
    | for values that come from the environment.
    |
    */

    'max_redirects' => (int) env('CRAWLER_MAX_REDIRECTS', 5),

    /*
    |---------------------------------------------------------------------------
    | User-Agent
    |---------------------------------------------------------------------------
    |
    | Identifies our crawler to target servers. The placeholder below is for
    | v0.1 development; ticket #10 replaces it with the production identity
    | and adds a `/bot` info page that the URL points at.
    |
    */

    'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'),
];
|
||||
|
|
@ -34,4 +34,27 @@ public function test_it_exposes_all_fields(): void
|
|||
$this->assertSame(5, $result->wordCount);
|
||||
$this->assertNull($result->errorMessage);
|
||||
}
|
||||
|
||||
/**
 * A failed fetch has no response to describe: every nullable field must
 * accept null while the outcome and error message are still carried through.
 */
public function test_it_accepts_null_for_failure_outcome_fields(): void
{
    // Arrange + act: build a result the way a failed connection would.
    $failure = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: 'Could not connect',
    );

    // The two fields that carry information on failure.
    self::assertSame(CrawlOutcomeEnum::Failed, $failure->outcome);
    self::assertSame('Could not connect', $failure->errorMessage);

    // Everything derived from a response stays null.
    self::assertNull($failure->statusCode);
    self::assertNull($failure->finalUrl);
    self::assertNull($failure->title);
    self::assertNull($failure->extractedText);
    self::assertNull($failure->wordCount);

    // The link collection is present but empty.
    self::assertSame([], $failure->outboundLinks->all());
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue