trove/config/crawler.php

46 lines
1.5 KiB
PHP

<?php
declare(strict_types=1);
return [
    /*
    |---------------------------------------------------------------------------
    | HTTP timeout (seconds)
    |---------------------------------------------------------------------------
    |
    | Hard cap on a single fetch. Guzzle's default is 0 (wait forever) — never
    | acceptable for a crawler. Tune up cautiously; longer timeouts amplify the
    | impact of slow targets on overall throughput.
    |
    | env() returns a string whenever the variable is set, so the value is cast
    | to int to keep its type identical to the default. Fractional values are
    | truncated: use whole seconds, since anything below 1 becomes 0.
    |
    */
    'timeout' => (int) env('CRAWLER_TIMEOUT', 10),

    /*
    |---------------------------------------------------------------------------
    | Maximum redirects to follow
    |---------------------------------------------------------------------------
    |
    | Guzzle default is 5. Cross-origin redirects are accepted in v0.1 — the
    | search engine treats the post-redirect URL as the canonical one for
    | indexing.
    |
    | Cast to int so the type does not depend on whether the variable is set.
    |
    */
    'max_redirects' => (int) env('CRAWLER_MAX_REDIRECTS', 5),

    /*
    |---------------------------------------------------------------------------
    | User-Agent
    |---------------------------------------------------------------------------
    |
    | Identifies our crawler to target servers. The placeholder below is for
    | v0.1 development; ticket #10 replaces it with the production identity
    | and adds a `/bot` info page that the URL points at.
    |
    */
    'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'),

    /*
    |---------------------------------------------------------------------------
    | Minimum per-domain delay (seconds)
    |---------------------------------------------------------------------------
    |
    | NOTE(review): judging by the key name, this is the minimum number of
    | seconds to wait between fetches against the same domain — confirm
    | against the code that reads this value.
    |
    | Cast to int so the type does not depend on whether the variable is set.
    |
    */
    'min_domain_delay_seconds' => (int) env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10),
];