46 lines
1.5 KiB
PHP
46 lines
1.5 KiB
PHP
<?php

declare(strict_types=1);

/*
 * Crawler HTTP settings.
 *
 * Every option can be overridden through an environment variable. env()
 * returns a string for any value that is actually read from the
 * environment, so the numeric options are cast to int here: consumers then
 * receive the same type whether the variable is set or the default is used.
 */
return [

    /*
    |--------------------------------------------------------------------------
    | HTTP timeout (seconds)
    |--------------------------------------------------------------------------
    |
    | Hard cap on a single fetch. Guzzle's default is 0 (wait forever) — never
    | acceptable for a crawler. Tune up cautiously; longer timeouts amplify the
    | impact of slow targets on overall throughput.
    |
    | Cast to int: fractional values such as "2.5" are truncated to whole
    | seconds.
    |
    */

    'timeout' => (int) env('CRAWLER_TIMEOUT', 10),

    /*
    |--------------------------------------------------------------------------
    | Maximum redirects to follow
    |--------------------------------------------------------------------------
    |
    | Guzzle default is 5. Cross-origin redirects are accepted in v0.1 — the
    | search engine treats the post-redirect URL as the canonical one for
    | indexing.
    |
    */

    'max_redirects' => (int) env('CRAWLER_MAX_REDIRECTS', 5),

    /*
    |--------------------------------------------------------------------------
    | User-Agent
    |--------------------------------------------------------------------------
    |
    | Identifies our crawler to target servers. The placeholder below is for
    | v0.1 development; ticket #10 replaces it with the production identity
    | and adds a `/bot` info page that the URL points at.
    |
    */

    'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'),

    /*
    |--------------------------------------------------------------------------
    | Minimum per-domain delay (seconds)
    |--------------------------------------------------------------------------
    |
    | NOTE(review): presumably the minimum pause between consecutive fetches
    | to the same domain — the consumer is not visible from this file, so
    | confirm against the code that reads this key.
    |
    */

    'min_domain_delay_seconds' => (int) env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10),

];