10 - Add /bot page with crawler identity and opt-out instructions
This commit is contained in:
parent
c80be24e6e
commit
69aa5d9d3e
3 changed files with 101 additions and 0 deletions
60
resources/views/bot.blade.php
Normal file
60
resources/views/bot.blade.php
Normal file
|
|
@@ -0,0 +1,60 @@
|
||||||
|
<x-layout>
|
||||||
|
<main>
|
||||||
|
<h1>About TroveBot</h1>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
<strong>Trove</strong> is a federated search engine for the small web,
|
||||||
|
seeded by fediverse attention and ranked by domain coherence rather than
|
||||||
|
commercial authority. <strong>TroveBot</strong> is its crawler — it
|
||||||
|
discovers and indexes URLs shared by people on the fediverse, then
|
||||||
|
follows the citations they make to find more of the small web.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2>Identity</h2>
|
||||||
|
|
||||||
|
<p>TroveBot identifies itself with the following User-Agent string:</p>
|
||||||
|
|
||||||
|
<pre><code>TroveBot/0.1 (+https://trove.lvl0.xyz/bot)</code></pre>
|
||||||
|
|
||||||
|
<h2>Crawling behavior</h2>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>Respects <code>robots.txt</code> rules under <code>User-agent: TroveBot</code> (and the wildcard <code>User-agent: *</code> as a fallback).</li>
|
||||||
|
<li>Polite per-domain rate limit — at most a few requests per minute per host.</li>
|
||||||
|
<li>Follows up to 5 redirects per URL.</li>
|
||||||
|
<li>Fetches HTML only. PDFs, images, and other binary content are recorded as discovered but never re-fetched.</li>
|
||||||
|
<li>Does not execute JavaScript, does not crawl behind authentication, and does not crawl URLs containing user credentials.</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<h2>Opt out</h2>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Block TroveBot entirely by adding the following to your site's
|
||||||
|
<code>robots.txt</code>:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre><code>User-agent: TroveBot
|
||||||
|
Disallow: /</code></pre>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Or block specific paths:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre><code>User-agent: TroveBot
|
||||||
|
Disallow: /private/
|
||||||
|
Disallow: /admin/</code></pre>
|
||||||
|
|
||||||
|
<h2>Contact &amp; source</h2>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>
|
||||||
|
Issues, questions, abuse reports:
|
||||||
|
<a href="https://forge.lvl0.xyz/lvl0/trove/issues">forge.lvl0.xyz/lvl0/trove/issues</a>
|
||||||
|
</li>
|
||||||
|
<li>
|
||||||
|
Source code:
|
||||||
|
<a href="https://forge.lvl0.xyz/lvl0/trove">forge.lvl0.xyz/lvl0/trove</a>
|
||||||
|
</li>
|
||||||
|
</ul>
|
||||||
|
</main>
|
||||||
|
</x-layout>
|
||||||
|
|
@@ -9,3 +9,5 @@
|
||||||
});
|
});
|
||||||
|
|
||||||
Route::view('/submit', 'urls.submit');
|
Route::view('/submit', 'urls.submit');
|
||||||
|
|
||||||
|
// Public informational page for the TroveBot crawler; serves the `bot` view at /bot.
Route::view('/bot', 'bot');
|
||||||
|
|
|
||||||
39
tests/Feature/BotPageTest.php
Normal file
39
tests/Feature/BotPageTest.php
Normal file
|
|
@@ -0,0 +1,39 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace Tests\Feature;
|
||||||
|
|
||||||
|
use Tests\TestCase;
|
||||||
|
|
||||||
|
/**
 * Feature tests for the public /bot page.
 *
 * The page documents the TroveBot crawler: its User-Agent string, how site
 * owners can opt out through robots.txt, and where the source code lives.
 * Every assertion uses `escape: false` so the expected text is compared
 * against the raw response body rather than an HTML-escaped copy.
 */
class BotPageTest extends TestCase
{
    /**
     * The page is reachable at /bot and answers with HTTP 200.
     */
    public function test_bot_page_renders_at_public_route(): void
    {
        $response = $this->get('/bot');

        $response->assertOk();
    }

    /**
     * The exact User-Agent string the crawler sends is shown on the page.
     */
    public function test_bot_page_contains_user_agent_string(): void
    {
        $response = $this->get('/bot');

        $response->assertSee('TroveBot/0.1 (+https://trove.lvl0.xyz/bot)', escape: false);
    }

    /**
     * The page shows the "block everything" robots.txt example.
     */
    public function test_bot_page_contains_robots_txt_opt_out_example(): void
    {
        $response = $this->get('/bot');

        // A bare 'Disallow: /' is also a prefix of 'Disallow: /private/', so it
        // would still match if the full-block example were removed. Anchoring on
        // the closing </code> tag pins the rule that disallows the whole site.
        $response->assertSeeInOrder(
            ['User-agent: TroveBot', 'Disallow: /</code>'],
            escape: false,
        );
    }

    /**
     * The page links to the source repository itself, not only to its issue tracker.
     */
    public function test_bot_page_links_to_forge_repository(): void
    {
        $response = $this->get('/bot');

        // The repository URL is a substring of the issues URL; matching the whole
        // href attribute (including the closing quote) requires the repository link.
        $response->assertSee('href="https://forge.lvl0.xyz/lvl0/trove"', escape: false);
    }
}
|
||||||
Loading…
Reference in a new issue