12 - Make FetchResult fields nullable and add missing crawler config
This commit is contained in:
parent
a9f2d689ae
commit
bb7906e193
3 changed files with 75 additions and 5 deletions
|
|
@ -9,14 +9,17 @@
|
||||||
|
|
||||||
/**
 * Data holder for the result of a fetch, built entirely through its
 * constructor; every field is a public promoted property.
 *
 * Only `$outcome` and `$outboundLinks` are always present. The remaining
 * fields are nullable so that a failure outcome can be represented with
 * nulls plus an error message, while a successful result can leave
 * `$errorMessage` null.
 */
class FetchResult
{
    /**
     * @param CrawlOutcomeEnum        $outcome       Outcome classification of the fetch.
     * @param int|null                $statusCode    Presumably the HTTP status code; null when unavailable — TODO confirm.
     * @param string|null             $finalUrl      Presumably the post-redirect URL; null when unavailable — TODO confirm.
     * @param string|null             $title         Title of the fetched document, or null.
     * @param string|null             $extractedText Text extracted from the fetched document, or null.
     * @param Collection<int, string> $outboundLinks Outbound links; may be an empty collection.
     * @param int|null                $wordCount     Word count, or null.
     * @param string|null             $errorMessage  Error description, or null when there is none.
     */
    public function __construct(
        public CrawlOutcomeEnum $outcome,
        public ?int $statusCode,
        public ?string $finalUrl,
        public ?string $title,
        public ?string $extractedText,
        public Collection $outboundLinks,
        public ?int $wordCount,
        public ?string $errorMessage,
    ) {}
}
|
||||||
|
|
|
||||||
44
config/crawler.php
Normal file
44
config/crawler.php
Normal file
|
|
@ -0,0 +1,44 @@
|
||||||
|
<?php

declare(strict_types=1);

// Crawler settings. Numeric options are cast to int because values read from
// the environment arrive as strings; the cast keeps the type stable whether
// the default or an override is in effect.
return [
    /*
    |---------------------------------------------------------------------------
    | HTTP timeout (seconds)
    |---------------------------------------------------------------------------
    |
    | Hard cap on a single fetch. Guzzle's default is 0 (wait forever) — never
    | acceptable for a crawler. Tune up cautiously; longer timeouts amplify the
    | impact of slow targets on overall throughput.
    |
    | Whole seconds only: a fractional override is truncated by the int cast.
    |
    */

    'timeout' => (int) env('CRAWLER_TIMEOUT', 10),

    /*
    |---------------------------------------------------------------------------
    | Maximum redirects to follow
    |---------------------------------------------------------------------------
    |
    | Guzzle default is 5. Cross-origin redirects are accepted in v0.1 — the
    | search engine treats the post-redirect URL as the canonical one for
    | indexing.
    |
    */

    'max_redirects' => (int) env('CRAWLER_MAX_REDIRECTS', 5),

    /*
    |---------------------------------------------------------------------------
    | User-Agent
    |---------------------------------------------------------------------------
    |
    | Identifies our crawler to target servers. The placeholder below is for
    | v0.1 development; ticket #10 replaces it with the production identity
    | and adds a `/bot` info page that the URL points at.
    |
    */

    'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'),
];
|
||||||
|
|
@ -34,4 +34,27 @@ public function test_it_exposes_all_fields(): void
|
||||||
$this->assertSame(5, $result->wordCount);
|
$this->assertSame(5, $result->wordCount);
|
||||||
$this->assertNull($result->errorMessage);
|
$this->assertNull($result->errorMessage);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function test_it_accepts_null_for_failure_outcome_fields(): void
{
    // Failure case: only the outcome and the error message carry a value;
    // every nullable field is passed as null and the link collection is empty.
    $message = 'Could not connect';

    $failure = new FetchResult(
        outcome: CrawlOutcomeEnum::Failed,
        statusCode: null,
        finalUrl: null,
        title: null,
        extractedText: null,
        outboundLinks: collect(),
        wordCount: null,
        errorMessage: $message,
    );

    // Each property must hand back exactly what the constructor received.
    $this->assertSame(CrawlOutcomeEnum::Failed, $failure->outcome);
    $this->assertNull($failure->statusCode);
    $this->assertNull($failure->finalUrl);
    $this->assertNull($failure->title);
    $this->assertNull($failure->extractedText);
    $this->assertSame([], $failure->outboundLinks->all());
    $this->assertNull($failure->wordCount);
    $this->assertSame($message, $failure->errorMessage);
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue