Merge release/0.1.0 into main
Some checks failed
Build and Push Docker Image / build (push) Has been cancelled

This commit is contained in:
myrmidex 2026-04-29 23:29:27 +02:00
commit 7e62cbc613
107 changed files with 7359 additions and 342 deletions

49
.dockerignore Normal file
View file

@ -0,0 +1,49 @@
# Paths listed here are left out of the Docker build context.
# Version control
.git
.gitignore
.gitattributes
# Dev environment
# NOTE(review): `docker/` is excluded as a whole, which also removes everything
# under docker/prod/ from the build context while CI builds with
# `file: docker/prod/Dockerfile`. If that Dockerfile COPYs any script or config
# from docker/, the COPY will not find it — confirm, and re-include the needed
# paths with a `!` rule if so.
shell.nix
Dockerfile.dev
docker/
# Tests (not needed in prod image)
tests/
phpunit.xml
.phpunit.result.cache
phpstan.neon
# Dependencies (rebuilt during image build)
node_modules/
vendor/
# Build artifacts (frontend stage produces these)
public/build/
public/hot
# Editor / OS
.editorconfig
.idea/
.vscode/
.DS_Store
*.swp
*.swo
# Env / secrets
# Every .env variant is excluded; only the .env.example template is re-included.
.env
.env.*
!.env.example
# Logs and runtime caches
storage/logs/*.log
storage/framework/cache/data/
storage/framework/sessions/
storage/framework/views/
# CI
.forgejo/
# Docs / project meta
README.md
LICENSE

View file

@ -61,3 +61,9 @@ AWS_BUCKET=
AWS_USE_PATH_STYLE_ENDPOINT=false AWS_USE_PATH_STYLE_ENDPOINT=false
VITE_APP_NAME="${APP_NAME}" VITE_APP_NAME="${APP_NAME}"
CRAWLER_MIN_DOMAIN_DELAY_SECONDS=10
NTFY_URL=
NTFY_TOPIC=
NTFY_THRESHOLD=

View file

@ -5,8 +5,7 @@ on:
branches: [main] branches: [main]
tags: ['v*'] tags: ['v*']
paths: paths:
- 'Dockerfile' - 'docker/prod/Dockerfile'
- 'docker/**'
- 'app/**' - 'app/**'
- 'bootstrap/**' - 'bootstrap/**'
- 'config/**' - 'config/**'
@ -51,6 +50,6 @@ jobs:
uses: https://data.forgejo.org/docker/build-push-action@v5 uses: https://data.forgejo.org/docker/build-push-action@v5
with: with:
context: . context: .
file: Dockerfile file: docker/prod/Dockerfile
push: true push: true
tags: ${{ steps.meta.outputs.tags }} tags: ${{ steps.meta.outputs.tags }}

126
README.md
View file

@ -1,6 +1,128 @@
# trove # Trove
A small web search engine. A federated search engine for the small web. Seeded by fediverse attention, ranked by domain coherence rather than commercial authority.
## Tech stack
Laravel 13 · Livewire 4 · PostgreSQL 17 (tsvector FTS) · Redis 7 · FrankenPHP · Vite 8 · Tailwind 4.
## Local development
Requires [Nix](https://nixos.org/download/) and [Podman](https://podman.io/).
```sh
nix-shell # enter dev shell
dev-up # start app, db, redis
```
App: `http://localhost:8200` · Vite HMR: `http://localhost:5175`
Other helpers inside the nix shell: `dev-down`, `dev-rebuild`, `dev-shell`, `dev-artisan <cmd>`, `dev-logs`.
## Self-hosting
Trove ships as a Docker image published to `forge.lvl0.xyz/lvl0/trove`. You provide the compose/stack config.
### Required environment
| Variable | Purpose |
|---|---|
| `APP_KEY` | Laravel app key. Generate with `docker run --rm forge.lvl0.xyz/lvl0/trove:latest php artisan key:generate --show`. **Must persist across deployments** or sessions/encrypted data break. |
| `APP_URL` | Public URL, e.g. `https://trove.example.org` |
| `DB_DATABASE`, `DB_USERNAME`, `DB_PASSWORD` | PostgreSQL credentials |
| `DB_HOST` | Hostname of the PostgreSQL service. Default `db`. Override if your service is named differently. |
| `REDIS_HOST` | Hostname of the Redis service. Default `redis`. Override if your service is named differently. |
### Services you need to provide
- **App**: pull `forge.lvl0.xyz/lvl0/trove:latest` (or a pinned `v*` tag). Exposes port `8000` inside the container. The image runs migrations and warms caches on boot.
- **Worker**: same image as `app`, with `command: php artisan queue:work --tries=3 --max-time=3600`. Processes the crawler queue (URL fetching, content extraction, retries). Crawls won't actually run without this — `app` only enqueues work. **Required for the crawler to function.**
- **PostgreSQL 17**. Hostname must be reachable as `db` (default) or set `DB_HOST`. Persist `/var/lib/postgresql/data`.
- **Redis 7** with `--appendonly yes` (queue jobs persist across restarts). Hostname `redis` or set `REDIS_HOST`.
On every boot the startup script waits for PostgreSQL, warms caches, then runs `php artisan migrate --force` automatically. The 60-second wait loop covers slow PostgreSQL initialisation; it exits with a clear error if PostgreSQL never becomes reachable.
### Volumes
- `/app/storage` — Laravel writable paths (logs, cached views, uploads). Persist this.
### Healthcheck
The image exposes `GET /up` (Laravel's built-in health route). The Dockerfile declares a HEALTHCHECK; your orchestrator can use `curl -fsS http://localhost:8000/up` for liveness.
### Example compose stack
A minimal reference — adapt for your infra. DockGE, Portainer, `docker compose`, Kubernetes, and bare `podman play kube` all work with equivalent configs.
```yaml
services:
app:
image: forge.lvl0.xyz/lvl0/trove:latest
restart: always
ports: ["${APP_PORT:-8400}:8000"]
environment:
APP_KEY: "${APP_KEY}"
APP_URL: "${APP_URL}"
DB_DATABASE: "${DB_DATABASE}"
DB_USERNAME: "${DB_USERNAME}"
DB_PASSWORD: "${DB_PASSWORD}"
volumes:
- app_storage:/app/storage
depends_on:
db: { condition: service_healthy }
redis: { condition: service_healthy }
worker:
image: forge.lvl0.xyz/lvl0/trove:latest
restart: always
command: php artisan queue:work --tries=3 --max-time=3600
environment:
APP_KEY: "${APP_KEY}"
APP_URL: "${APP_URL}"
DB_DATABASE: "${DB_DATABASE}"
DB_USERNAME: "${DB_USERNAME}"
DB_PASSWORD: "${DB_PASSWORD}"
volumes:
- app_storage:/app/storage
depends_on:
db: { condition: service_healthy }
redis: { condition: service_healthy }
db:
image: postgres:17-alpine
restart: always
environment:
POSTGRES_DB: "${DB_DATABASE}"
POSTGRES_USER: "${DB_USERNAME}"
POSTGRES_PASSWORD: "${DB_PASSWORD}"
volumes:
- db_data:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"]
interval: 10s
retries: 5
start_period: 10s
redis:
image: redis:7-alpine
restart: always
command: redis-server --appendonly yes
volumes:
- redis_data:/data
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
retries: 5
volumes:
db_data:
redis_data:
app_storage:
```
### Upgrades
Pull the new image tag and recreate the `app` and `worker` containers (both run the same image). Migrations run on boot (`php artisan migrate --force` in the startup script). Roll back by pointing at the previous `v*` tag.
---- ----

View file

@ -0,0 +1,194 @@
<?php
declare(strict_types=1);
namespace App\Actions;
use App\Enums\CrawlOutcomeEnum;
use App\Services\LanguageDetectionService;
use App\Services\UrlService;
use App\ValueObjects\FetchResult;
use fivefilters\Readability\Configuration;
use fivefilters\Readability\Readability;
use GuzzleHttp\Exception\ConnectException;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\Factory;
use Illuminate\Http\Client\Response;
use InvalidArgumentException;
use League\Uri\BaseUri;
use Symfony\Component\DomCrawler\Crawler;
use Throwable;
/**
 * Fetches a single URL over HTTP and condenses the response into a FetchResult:
 * crawl outcome, status code, title, readable text, outbound links and language.
 *
 * Connection-level failures are reported as a FetchResult (Failed or Timeout)
 * instead of being thrown.
 */
class FetchPageAction
{
    /** Minimum word count before text-based language detection is attempted. */
    private const MIN_WORDS_FOR_TEXT_DETECTION = 20;

    /** Minimum detector score for a text-based language guess to be accepted. */
    private const MIN_TEXT_DETECTION_CONFIDENCE = 0.30;

    public function __construct(
        private Factory $http,
        private UrlService $urlService,
        private LanguageDetectionService $languageDetection,
    ) {}

    /**
     * Fetch $url and describe what came back.
     *
     * @param  string  $url  Absolute URL to request.
     * @return FetchResult Content fields (title, text, links, word count, language)
     *                     are populated only when the outcome is Success.
     */
    public function __invoke(string $url): FetchResult
    {
        try {
            $response = $this->http
                ->timeout(config('crawler.timeout'))
                ->withHeaders([
                    'User-Agent' => config('crawler.user_agent'),
                    'Accept' => 'text/html',
                ])
                ->withOptions([
                    'allow_redirects' => ['max' => config('crawler.max_redirects')],
                ])
                ->get($url);
        } catch (ConnectionException|ConnectException $e) {
            return $this->failureResult($e);
        }

        [$outcome, $error] = $this->validateResponse($response);

        // Content fields stay null for every non-success outcome.
        $title = null;
        $extractedText = null;
        $links = collect();
        $wordCount = null;
        $language = null;
        $languageConfidence = null;

        if ($outcome === CrawlOutcomeEnum::Success) {
            [$title, $extractedText, $links, $crawler] = $this->extractTitleTextAndLinks($response->body(), $url);
            $wordCount = $this->countWords($extractedText);
            [$language, $languageConfidence] = $this->detectLanguage($crawler, $extractedText, $wordCount);
        }

        return new FetchResult(
            outcome: $outcome,
            statusCode: $response->status(),
            // The request URL is reported here; the post-redirect URL is not tracked.
            finalUrl: $url,
            title: $title,
            extractedText: $extractedText,
            outboundLinks: $links,
            wordCount: $wordCount,
            errorMessage: $error,
            language: $language,
            languageConfidence: $languageConfidence,
        );
    }

    /**
     * Classify an HTTP response into a crawl outcome.
     *
     * @return array{0: CrawlOutcomeEnum, 1: ?string} Outcome plus an error message (null on success).
     */
    private function validateResponse(Response $response): array
    {
        $status = $response->status();

        if ($status >= 400 && $status < 500) {
            return [CrawlOutcomeEnum::Blocked4xx, "HTTP {$status}"];
        }

        if ($status >= 500) {
            return [CrawlOutcomeEnum::Blocked5xx, "HTTP {$status}"];
        }

        $contentType = $response->header('Content-Type');

        if (! str_starts_with(mb_strtolower($contentType), 'text/html')) {
            return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"];
        }

        return [CrawlOutcomeEnum::Success, null];
    }

    /**
     * Build the FetchResult for a request that never produced a response.
     *
     * A cURL "operation timed out" errno maps to Timeout; everything else to Failed.
     */
    private function failureResult(ConnectionException|ConnectException $e): FetchResult
    {
        // The cURL errno lives on Guzzle's ConnectException, which may be the
        // exception itself or the "previous" exception of the one caught.
        $guzzleException = $e instanceof ConnectException
            ? $e
            : ($e->getPrevious() instanceof ConnectException
                ? $e->getPrevious()
                : null);

        $errno = $guzzleException?->getHandlerContext()['errno'] ?? null;

        $outcome = $errno === CURLE_OPERATION_TIMEDOUT
            ? CrawlOutcomeEnum::Timeout
            : CrawlOutcomeEnum::Failed;

        return new FetchResult(
            outcome: $outcome,
            statusCode: null,
            finalUrl: null,
            title: null,
            extractedText: null,
            outboundLinks: collect(),
            wordCount: null,
            errorMessage: $e->getMessage(),
        );
    }

    /**
     * Pull the <title>, the readable main text and the links inside that main
     * content out of an HTML body.
     *
     * @return array{0: ?string, 1: string, 2: \Illuminate\Support\Collection<int, string>, 3: Crawler}
     */
    private function extractTitleTextAndLinks(string $body, string $url): array
    {
        $crawler = new Crawler($body);

        $title = $crawler->filter('title')->count() > 0
            ? trim($crawler->filter('title')->text())
            : null;

        // Readability throws when it cannot isolate an article (very short or
        // unusual pages). Treat that as "no main content" so the fetch still
        // yields a result instead of failing the whole crawl.
        try {
            $readability = new Readability(new Configuration);
            $readability->parse($body);
            $mainContent = $readability->getContent() ?? '';
        } catch (Throwable) {
            $mainContent = '';
        }

        $extractedText = trim(strip_tags($mainContent));

        $links = collect();

        if ($mainContent !== '') {
            $linkCrawler = new Crawler($mainContent);

            if ($linkCrawler->filter('a[href]')->count() > 0) {
                $links = collect($linkCrawler->filter('a[href]')->extract(['href']));
            }
        }

        $linksResolved = $links
            ->map(fn (string $href) => $this->resolveAndValidateLink($href, $url))
            ->filter()
            ->unique()
            ->values();

        return [$title, $extractedText, $linksResolved, $crawler];
    }

    /**
     * Count whitespace-separated words in the extracted text.
     *
     * preg_split() returns false when the /u pattern meets invalid UTF-8, which
     * real-world pages do contain; in that case fall back to a byte-wise split
     * rather than passing false to count().
     */
    private function countWords(string $text): int
    {
        $text = trim($text);

        if ($text === '') {
            return 0;
        }

        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        if ($words === false) {
            $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        }

        return count($words);
    }

    /**
     * Resolve an href against the page URL and keep it only if it is a usable link.
     *
     * @return ?string Absolute URL without fragment, or null when the href cannot be
     *                 resolved, points back at the page itself, or is rejected by UrlService.
     */
    private function resolveAndValidateLink(string $href, string $finalUrl): ?string
    {
        try {
            $resolved = (string) BaseUri::from($finalUrl)->resolve($href);
            // Drop the fragment; keep the full string when stripping would leave nothing.
            $resolved = strstr($resolved, '#', true) ?: $resolved;
        } catch (Throwable) {
            return null;
        }

        if ($resolved === $finalUrl) {
            return null;
        }

        try {
            // Called for validation only: host() throws on URLs it does not accept.
            $this->urlService->host($resolved);
        } catch (InvalidArgumentException) {
            return null;
        }

        return $resolved;
    }

    /**
     * Determine the page language, preferring detection on the extracted text and
     * falling back to the <html lang="…"> attribute.
     *
     * @return array{0: ?string, 1: ?float}
     */
    private function detectLanguage(Crawler $crawler, string $extractedText, ?int $wordCount = null): array
    {
        if ($wordCount >= self::MIN_WORDS_FOR_TEXT_DETECTION) {
            $result = $this->languageDetection->detect($extractedText);

            if ($result !== null && $result[1] >= self::MIN_TEXT_DETECTION_CONFIDENCE) {
                return [$result[0], $result[1]];
            }
        }

        $lang = $crawler->filter('html')->count() > 0
            ? trim($crawler->filter('html')->attr('lang') ?? '')
            : '';

        // A declared lang attribute is reported with confidence 1.0; values longer
        // than 35 characters are ignored.
        if ($lang !== '' && strlen($lang) <= 35) {
            return [$lang, 1.0];
        }

        return [null, null];
    }
}

View file

@ -0,0 +1,22 @@
<?php
declare(strict_types=1);
namespace App\Actions;
use App\Enums\PageStatusEnum;
use App\Models\Page;
/**
 * Registers a URL as a Page, reusing the existing row when the URL is already known.
 */
class RegisterDiscoveredPageAction
{
    /**
     * Look the page up by URL and create it if it is missing.
     *
     * @param  string  $url  URL that identifies the page.
     * @param  ?int  $instanceId  Instance stored on the row when it has to be created.
     * @return Page The existing or newly created page.
     */
    public function __invoke(string $url, ?int $instanceId = null): Page
    {
        $lookup = ['url' => $url];

        // Only written when no row matches the lookup.
        $valuesForNewRow = [
            'status' => PageStatusEnum::Discovered,
            'instance_id' => $instanceId,
        ];

        return Page::firstOrCreate($lookup, $valuesForNewRow);
    }
}

View file

@ -0,0 +1,60 @@
<?php
declare(strict_types=1);
namespace App\Enums;
enum CrawlOutcomeEnum: string
{
    case Success = 'success';
    case Failed = 'failed';
    case Timeout = 'timeout';
    case BlockedRobots = 'blocked_robots';
    case Blocked4xx = 'blocked_4xx';
    case Blocked5xx = 'blocked_5xx';

    /**
     * The HTTP fetch succeeded (2xx) but the response is unindexable in v0.1
     * (non-HTML Content-Type). The worker MUST also write `pages.status = Rejected`
     * on this outcome — do NOT treat it as Failed. The page row STAYS in the DB to
     * prevent re-discovery loops as the fediverse re-shares the URL.
     */
    case Rejected = 'rejected';

    /**
     * Map this outcome to the status the parent `pages` row should end up with.
     */
    public function toPageStatus(): PageStatusEnum
    {
        if ($this === self::Success) {
            return PageStatusEnum::Fetched;
        }

        if ($this === self::Rejected) {
            return PageStatusEnum::Rejected;
        }

        // Failed, Timeout, BlockedRobots, Blocked4xx and Blocked5xx.
        return PageStatusEnum::Failed;
    }

    /**
     * Whether the worker should try this crawl again. Only transient failures
     * qualify; 4xx, robots blocks, rejected content types and successes do not.
     */
    public function isRetryable(): bool
    {
        $transient = [self::Failed, self::Timeout, self::Blocked5xx];

        return in_array($this, $transient, true);
    }

    /**
     * Whether outbound links found during the fetch should be registered.
     * Only a Success outcome qualifies.
     */
    public function shouldRegisterOutboundLinks(): bool
    {
        return match ($this) {
            self::Success => true,
            default => false,
        };
    }
}

View file

@ -0,0 +1,20 @@
<?php
declare(strict_types=1);
namespace App\Enums;
/**
 * Status stored on a `pages` row.
 */
enum PageStatusEnum: string
{
case Discovered = 'discovered';
case Fetched = 'fetched';
case Failed = 'failed';
/**
 * The crawler fetched the page but rejected it as unindexable in v0.1
 * (non-HTML Content-Type). The page row stays as a sentinel preventing
 * re-discovery loops; a future re-crawl could move the status back through
 * Discovered to Fetched if the URL starts serving HTML.
 */
case Rejected = 'rejected';
}
}

View file

@ -0,0 +1,23 @@
<?php
declare(strict_types=1);
namespace App\Http\Controllers\Admin;
use App\Enums\PageStatusEnum;
use App\Http\Controllers\Controller;
use Illuminate\View\View;
use Lvl0\FediDiscover\Models\Instance;
/**
 * Admin listing of instances with their page counts.
 */
class InstancesController extends Controller
{
    /**
     * Render the admin index with every instance, ordered by URL, each carrying
     * `pages_count` and `failed_pages_count`.
     */
    public function index(): View
    {
        $onlyFailed = fn ($pages) => $pages->where('status', PageStatusEnum::Failed);

        $query = Instance::withCount([
            'pages',
            'pages as failed_pages_count' => $onlyFailed,
        ]);

        $instances = $query->orderBy('url', 'asc')->get();

        return view('admin.index', compact('instances'));
    }
}

View file

@ -0,0 +1,127 @@
<?php
declare(strict_types=1);
namespace App\Jobs;
use App\Actions\FetchPageAction;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use App\Models\PageCrawl;
use App\Services\PolitenessService;
use App\Services\RobotsService;
use App\ValueObjects\FetchResult;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Queue\Queueable;
use Illuminate\Support\Facades\Cache;
/**
 * Queue job that performs one crawl attempt for a PageCrawl row: robots check,
 * per-domain politeness lock, fetch, outcome bookkeeping, outbound link
 * registration and retry scheduling.
 */
class ProcessCrawlJob implements ShouldQueue
{
    use Queueable;

    public function __construct(
        public PageCrawl $pageCrawl,
    ) {}

    /**
     * Run the crawl attempt.
     *
     * Ends early when robots.txt disallows the URL (recorded as BlockedRobots) or
     * when the domain lock is held (the job is released back onto the queue).
     */
    public function handle(): void
    {
        $robotsService = resolve(RobotsService::class);

        if (! $robotsService->isAllowed($this->pageCrawl->page->url)) {
            $this->pageCrawl->update([
                'outcome' => CrawlOutcomeEnum::BlockedRobots,
                'completed_at' => now(),
            ]);

            // Same shape as the Failed branch of updatePageStatus(), so a
            // robots-blocked page also carries a failed_at timestamp.
            $this->pageCrawl->page->update([
                'status' => PageStatusEnum::Failed,
                'failed_at' => now(),
            ]);

            return;
        }

        $fetcher = resolve(FetchPageAction::class);
        $register = resolve(RegisterDiscoveredPageAction::class);
        $politenessService = resolve(PolitenessService::class);

        $delay = $politenessService->minDelayFor($this->pageCrawl->domain);

        // The lock is never released explicitly here; it lapses after $delay
        // seconds, which appears to be how per-domain request spacing is enforced.
        $lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay);

        if (! $lock->get()) {
            $this->release($delay);

            return;
        }

        $result = $fetcher($this->pageCrawl->page->url);

        $this->writeOutcome($result);
        $this->updatePageStatus($result);

        if ($result->outcome->shouldRegisterOutboundLinks()) {
            $result->outboundLinks->each(fn (string $url) => $register($url));
        }

        if ($result->outcome->isRetryable()) {
            $this->scheduleRetryIfNeeded();
        }
    }

    /**
     * Persist the fetch outcome on this crawl row.
     */
    private function writeOutcome(FetchResult $result): void
    {
        $this->pageCrawl->update([
            'outcome' => $result->outcome,
            'completed_at' => now(),
            'status_code' => $result->statusCode,
            'error_message' => $result->errorMessage,
        ]);
    }

    /**
     * Move the parent page to the status implied by the fetch outcome.
     */
    private function updatePageStatus(FetchResult $result): void
    {
        $status = $result->outcome->toPageStatus();

        $update = match ($status) {
            PageStatusEnum::Fetched => [
                'status' => $status,
                'fetched_at' => now(),
                'title' => $result->title,
                // Sticky language: only write when detection produced a value, so a re-crawl
                // returning null doesn't erase a previously-detected language. Guarding on
                // language alone is sufficient because FetchPageAction::detectLanguage()
                // always returns the pair as both-null or both-non-null (never mixed).
                ...($result->language !== null ? [
                    'language' => $result->language,
                    'language_confidence' => $result->languageConfidence,
                ] : []),
            ],
            PageStatusEnum::Failed => [
                'status' => $status,
                'failed_at' => now(),
            ],
            PageStatusEnum::Rejected => [
                'status' => $status,
            ],
            PageStatusEnum::Discovered => [
                'status' => $status,
            ],
        };

        $this->pageCrawl->page->update($update);
    }

    /**
     * Queue another attempt in one hour unless the page already has three crawl rows.
     */
    private function scheduleRetryIfNeeded(): void
    {
        if (PageCrawl::where('page_id', $this->pageCrawl->page_id)->count() >= 3) {
            return;
        }

        // Events are muted so the model observer does not dispatch the job
        // immediately; the delayed dispatch below is the only one.
        //
        // Only the identifying attributes are carried over. Copying the whole
        // current row would create the retry with the failed attempt's
        // completed_at, status_code and error_message already filled in.
        $newRow = PageCrawl::withoutEvents(
            fn () => PageCrawl::create([
                'page_id' => $this->pageCrawl->page_id,
                'domain' => $this->pageCrawl->domain,
                'priority' => $this->pageCrawl->priority,
            ])
        );

        ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
    }
}

View file

@ -0,0 +1,18 @@
<?php
declare(strict_types=1);
namespace App\Listeners;
use App\Services\PollAlertService;
use Lvl0\FediDiscover\Events\PollFailed;
/**
 * Forwards PollFailed events to the PollAlertService.
 */
class PollFailedListener
{
    private PollAlertService $service;

    public function __construct(PollAlertService $service)
    {
        $this->service = $service;
    }

    /**
     * Record the failed poll for the instance named in the event.
     */
    public function handle(PollFailed $event): void
    {
        $instance = $event->instance;
        $reason = $event->message;

        $this->service->recordFailure($instance, $reason);
    }
}

View file

@ -0,0 +1,36 @@
<?php
declare(strict_types=1);
namespace App\Listeners;
use App\Actions\RegisterDiscoveredPageAction;
use App\Models\PageLink;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Support\Facades\DB;
use Lvl0\FediDiscover\Events\UrlDiscovered;
/**
 * Queued listener that registers a discovered URL as a page and, when the event
 * names a distinct post URL, links the post page to it.
 */
class UrlDiscoveredListener implements ShouldQueue
{
    public function __construct(
        private RegisterDiscoveredPageAction $registerPage,
    ) {}

    /**
     * Register the target page and, if applicable, the source page and the
     * link between them, inside a single database transaction.
     */
    public function handle(UrlDiscovered $event): void
    {
        DB::transaction(function () use ($event): void {
            $register = $this->registerPage;

            $target = $register($event->url, $event->instanceId);

            // A link is only recorded when the post URL exists and differs from the target.
            $hasDistinctPost = $event->postUrl !== null && $event->postUrl !== $event->url;

            if ($hasDistinctPost) {
                $source = $register($event->postUrl, $event->instanceId);

                PageLink::firstOrCreate([
                    'source_page_id' => $source->id,
                    'target_page_id' => $target->id,
                ]);
            }
        });
    }
}

View file

@ -0,0 +1,44 @@
<?php
declare(strict_types=1);
namespace App\Livewire;
use App\Actions\RegisterDiscoveredPageAction;
use Illuminate\Contracts\View\View;
use Illuminate\Support\Facades\RateLimiter;
use Livewire\Component;
/**
 * Livewire form through which a visitor submits a URL for registration.
 */
class UrlSubmissionForm extends Component
{
    /** Value bound to the URL input. */
    public string $url = '';

    /** The most recently accepted URL, or null when nothing has been accepted yet. */
    public ?string $confirmedUrl = null;

    /**
     * Validate and register the submitted URL, limited to 10 attempts per IP
     * within a 60-second decay window. Every attempt that passes the limiter
     * check is counted, whether or not validation then succeeds.
     */
    public function submit(RegisterDiscoveredPageAction $registerPage): void
    {
        $throttleKey = 'submit-url:' . request()->ip();

        if (RateLimiter::tooManyAttempts($throttleKey, 10)) {
            $this->addError('rate_limit', 'Too many submissions, try again shortly.');

            return;
        }

        RateLimiter::hit($throttleKey, 60);

        $rules = ['url' => ['required', 'url:http,https']];
        $acceptedUrl = $this->validate($rules)['url'];

        $registerPage($acceptedUrl);

        $this->confirmedUrl = $acceptedUrl;
        $this->reset('url');
    }

    /**
     * Render the form view.
     */
    public function render(): View
    {
        return view('livewire.url-submission-form');
    }
}

68
app/Models/Page.php Normal file
View file

@ -0,0 +1,68 @@
<?php
declare(strict_types=1);
namespace App\Models;
use App\Enums\PageStatusEnum;
use App\Observers\PageObserver;
use Database\Factories\PageFactory;
use Illuminate\Database\Eloquent\Attributes\ObservedBy;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
use Illuminate\Database\Eloquent\Relations\HasMany;
use Illuminate\Database\Eloquent\Relations\HasOne;
use Lvl0\FediDiscover\Models\Instance;
/**
 * A URL known to the crawler, with its crawl status and extracted metadata.
 */
#[ObservedBy([PageObserver::class])]
class Page extends Model
{
    /** @use HasFactory<PageFactory> */
    use HasFactory;

    /** Attributes that may be mass-assigned. */
    protected $fillable = [
        'url',
        'status',
        'language',
        'language_confidence',
        'title',
        'instance_id',
        'posted_at',
        'fetched_at',
        'failed_at',
    ];

    /** Attribute casts applied when reading and writing the model. */
    protected $casts = [
        'status' => PageStatusEnum::class,
        'language_confidence' => 'float',
        'posted_at' => 'datetime',
        'fetched_at' => 'datetime',
        'failed_at' => 'datetime',
    ];

    /**
     * The instance this page belongs to.
     */
    public function instance(): BelongsTo
    {
        return $this->belongsTo(related: Instance::class);
    }

    /**
     * Links for which this page is the source.
     */
    public function outgoingLinks(): HasMany
    {
        return $this->hasMany(related: PageLink::class, foreignKey: 'source_page_id');
    }

    /**
     * Links for which this page is the target.
     */
    public function incomingLinks(): HasMany
    {
        return $this->hasMany(related: PageLink::class, foreignKey: 'target_page_id');
    }

    /**
     * Every crawl attempt recorded for this page.
     */
    public function crawls(): HasMany
    {
        return $this->hasMany(related: PageCrawl::class);
    }

    /**
     * The crawl attempt with the greatest `created_at`.
     */
    public function latestCrawl(): HasOne
    {
        $crawl = $this->hasOne(related: PageCrawl::class);

        return $crawl->latestOfMany('created_at');
    }
}

45
app/Models/PageCrawl.php Normal file
View file

@ -0,0 +1,45 @@
<?php
declare(strict_types=1);
namespace App\Models;
use App\Enums\CrawlOutcomeEnum;
use App\Observers\PageCrawlObserver;
use Database\Factories\PageCrawlFactory;
use Illuminate\Database\Eloquent\Attributes\ObservedBy;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
/**
 * One crawl attempt for a page, together with its outcome once completed.
 */
#[ObservedBy(PageCrawlObserver::class)]
class PageCrawl extends Model
{
    /** @use HasFactory<PageCrawlFactory> */
    use HasFactory;

    /** Attributes that may be mass-assigned. */
    protected $fillable = [
        'page_id',
        'domain',
        'priority',
        'completed_at',
        'outcome',
        'status_code',
        'error_message',
    ];

    /** Attribute casts applied when reading and writing the model. */
    protected $casts = [
        'priority' => 'integer',
        'completed_at' => 'datetime',
        'outcome' => CrawlOutcomeEnum::class,
        'status_code' => 'integer',
    ];

    /**
     * The page this crawl attempt was made for.
     *
     * @return BelongsTo<Page, $this>
     */
    public function page(): BelongsTo
    {
        return $this->belongsTo(related: Page::class);
    }
}
}

31
app/Models/PageLink.php Normal file
View file

@ -0,0 +1,31 @@
<?php
declare(strict_types=1);
namespace App\Models;
use Database\Factories\PageLinkFactory;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
/**
 * A directed link from one page (source) to another (target).
 */
class PageLink extends Model
{
    /** @use HasFactory<PageLinkFactory> */
    use HasFactory;

    /** Attributes that may be mass-assigned. */
    protected $fillable = [
        'source_page_id',
        'target_page_id',
    ];

    /**
     * The page the link starts from.
     */
    public function sourcePage(): BelongsTo
    {
        return $this->belongsTo(related: Page::class, foreignKey: 'source_page_id');
    }

    /**
     * The page the link points to.
     */
    public function targetPage(): BelongsTo
    {
        return $this->belongsTo(related: Page::class, foreignKey: 'target_page_id');
    }
}

View file

@ -0,0 +1,14 @@
<?php
namespace App\Observers;
use App\Jobs\ProcessCrawlJob;
use App\Models\PageCrawl;
/**
 * Model observer for PageCrawl.
 */
class PageCrawlObserver
{
    /**
     * Dispatch a crawl job for every newly created crawl row.
     */
    public function created(PageCrawl $pageCrawl): void
    {
        dispatch(new ProcessCrawlJob($pageCrawl));
    }
}

View file

@ -0,0 +1,25 @@
<?php
declare(strict_types=1);
namespace App\Observers;
use App\Models\Page;
use App\Models\PageCrawl;
use App\Services\UrlService;
class PageObserver
{
public function __construct(private UrlService $urlService) {}
public function created(Page $page): void
{
PageCrawl::firstOrCreate(
['page_id' => $page->id],
[
'domain' => $this->urlService->host($page->url),
'priority' => 0,
],
);
}
}

View file

@ -2,23 +2,24 @@
namespace App\Providers; namespace App\Providers;
use App\Listeners\PollFailedListener;
use App\Listeners\UrlDiscoveredListener;
use App\Services\LanguageDetectionService;
use Illuminate\Support\Facades\Event;
use Illuminate\Support\ServiceProvider; use Illuminate\Support\ServiceProvider;
use Lvl0\FediDiscover\Events\PollFailed;
use Lvl0\FediDiscover\Events\UrlDiscovered;
class AppServiceProvider extends ServiceProvider class AppServiceProvider extends ServiceProvider
{ {
/**
* Register any application services.
*/
public function register(): void public function register(): void
{ {
// $this->app->singleton(LanguageDetectionService::class);
} }
/**
* Bootstrap any application services.
*/
public function boot(): void public function boot(): void
{ {
// Event::listen(UrlDiscovered::class, UrlDiscoveredListener::class);
Event::listen(PollFailed::class, PollFailedListener::class);
} }
} }

View file

@ -0,0 +1,39 @@
<?php
declare(strict_types=1);
namespace App\Services;
use LanguageDetection\Language;
class LanguageDetectionService
{
private Language $language;
public function __construct()
{
$this->language = new Language;
}
/**
* @return array{0: string, 1: float}|null
*/
public function detect(string $text): ?array
{
if (trim($text) === '') {
return null;
}
$languages = $this->language->detect($text)->bestResults()->close();
if ($languages === []) {
return null;
}
// bestResults() keeps every candidate within 0.025 of the top score.
// array_key_first picks the highest-ranked one (arsort'd by the library).
$code = array_key_first($languages);
return [$code, $languages[$code]];
}
}

View file

@ -0,0 +1,19 @@
<?php
declare(strict_types=1);
namespace App\Services;
class PolitenessService
{
public function minDelayFor(string $domain): int
{
/** @var RobotsService $robotsService */
$robotsService = resolve(RobotsService::class);
$crawlDelay = $robotsService->crawlDelayFor($domain, config('crawler.user_agent'));
$configValue = config('crawler.min_domain_delay_seconds', 10);
return max($crawlDelay ?? 0, $configValue);
}
}

View file

@ -0,0 +1,38 @@
<?php
declare(strict_types=1);
namespace App\Services;
use Exception;
use Illuminate\Support\Facades\Http;
use Lvl0\FediDiscover\Models\Instance;
class PollAlertService
{
public function recordFailure(Instance $instance, string $message): void
{
$instance->increment('consecutive_poll_failures');
$instance->refresh();
$ntfyUrl = config('services.ntfy.url');
$ntfyThreshold = config('services.ntfy.threshold');
$ntfyTopic = config('services.ntfy.topic');
if ($ntfyUrl === null || $ntfyThreshold === null || $ntfyThreshold === 0 || $ntfyTopic === null) {
return;
}
if ($instance->consecutive_poll_failures < $ntfyThreshold) {
return;
}
try {
Http::timeout(5)
->withBody($instance->url . ' - ' . $message, 'text/plain')
->post(rtrim($ntfyUrl, '/') . '/' . $ntfyTopic);
} catch (Exception $e) {
logger()->warning('ntfy alert failed', ['instance' => $instance->url, 'error' => $e->getMessage()]);
}
}
}

View file

@ -0,0 +1,60 @@
<?php
declare(strict_types=1);
namespace App\Services;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Http;
use Spatie\Robots\RobotsTxt;
class RobotsService
{
public function __construct(
private UrlService $urlService,
) {}
public function isAllowed(string $url, ?string $userAgent = null): bool
{
$host = $this->urlService->host($url);
$path = parse_url($url, PHP_URL_PATH) ?? '/';
$body = Cache::remember(
"crawler:robots:{$host}",
config('crawler.robots_cache_ttl_seconds'),
function () use ($host) {
try {
$response = Http::get("https://{$host}/robots.txt");
return $response->successful() ? $response->body() : '';
} catch (ConnectionException) {
return '';
}
}
);
return (new RobotsTxt($body))->allows($path, $userAgent);
}
public function crawlDelayFor(string $host, string $userAgent): ?int
{
$body = Cache::remember(
"crawler:robots:{$host}",
config('crawler.robots_cache_ttl_seconds'),
function () use ($host) {
try {
$response = Http::get("https://{$host}/robots.txt");
return $response->successful() ? $response->body() : '';
} catch (ConnectionException) {
return '';
}
}
);
$delay = (new RobotsTxt($body))->crawlDelay($userAgent);
return $delay !== null ? (int) $delay : null;
}
}

View file

@ -0,0 +1,40 @@
<?php
declare(strict_types=1);
namespace App\Services;
use Illuminate\Support\Uri;
use InvalidArgumentException;
class UrlService
{
public function host(string $url): string
{
$uri = Uri::of($url);
$scheme = $uri->scheme();
if ($scheme === null || $scheme === '') {
throw new InvalidArgumentException("URL has no scheme: {$url}");
}
if (! in_array($scheme, ['http', 'https'], true)) {
throw new InvalidArgumentException("Invalid URL scheme: {$scheme}");
}
if ($uri->user() !== null) {
throw new InvalidArgumentException("URLs with embedded credentials not allowed: {$url}");
}
$host = $uri->host();
if ($host === null || $host === '') {
throw new InvalidArgumentException("URL has no host: {$url}");
}
$bareHost = preg_replace('/%.*$/', '', trim($host, '[]'));
if (filter_var($bareHost, FILTER_VALIDATE_IP) !== false) {
throw new InvalidArgumentException("IP literal hosts not allowed: {$host}");
}
return mb_strtolower($host);
}
}

View file

@ -0,0 +1,28 @@
<?php
declare(strict_types=1);
namespace App\ValueObjects;
use App\Enums\CrawlOutcomeEnum;
use Illuminate\Support\Collection;
/**
 * Immutable description of one fetch attempt.
 */
final readonly class FetchResult
{
    public CrawlOutcomeEnum $outcome;

    public ?int $statusCode;

    public ?string $finalUrl;

    public ?string $title;

    public ?string $extractedText;

    public Collection $outboundLinks;

    public ?int $wordCount;

    public ?string $errorMessage;

    public ?string $language;

    public ?float $languageConfidence;

    /**
     * @param ?string $finalUrl Set to the request URL in v0.1; true post-redirect URL tracking is deferred (see ticket #12 spec). Downstream consumers MUST NOT trust this field as the post-redirect location until that lands.
     * @param Collection<int, string> $outboundLinks
     */
    public function __construct(
        CrawlOutcomeEnum $outcome,
        ?int $statusCode,
        ?string $finalUrl,
        ?string $title,
        ?string $extractedText,
        Collection $outboundLinks,
        ?int $wordCount,
        ?string $errorMessage,
        ?string $language = null,
        ?float $languageConfidence = null,
    ) {
        $this->outcome = $outcome;
        $this->statusCode = $statusCode;
        $this->finalUrl = $finalUrl;
        $this->title = $title;
        $this->extractedText = $extractedText;
        $this->outboundLinks = $outboundLinks;
        $this->wordCount = $wordCount;
        $this->errorMessage = $errorMessage;
        $this->language = $language;
        $this->languageConfidence = $languageConfidence;
    }
}
}

View file

@ -3,6 +3,7 @@
use Illuminate\Foundation\Application; use Illuminate\Foundation\Application;
use Illuminate\Foundation\Configuration\Exceptions; use Illuminate\Foundation\Configuration\Exceptions;
use Illuminate\Foundation\Configuration\Middleware; use Illuminate\Foundation\Configuration\Middleware;
use Illuminate\Http\Request;
return Application::configure(basePath: dirname(__DIR__)) return Application::configure(basePath: dirname(__DIR__))
->withRouting( ->withRouting(
@ -11,7 +12,11 @@
health: '/up', health: '/up',
) )
->withMiddleware(function (Middleware $middleware): void { ->withMiddleware(function (Middleware $middleware): void {
// $middleware->trustProxies(
at: '*',
headers: Request::HEADER_X_FORWARDED_FOR
| Request::HEADER_X_FORWARDED_PROTO,
);
}) })
->withExceptions(function (Exceptions $exceptions): void { ->withExceptions(function (Exceptions $exceptions): void {
// //

View file

@ -16,10 +16,14 @@
], ],
"require": { "require": {
"php": "^8.3", "php": "^8.3",
"fivefilters/readability.php": "^3.3",
"laravel/framework": "^13.0", "laravel/framework": "^13.0",
"laravel/tinker": "^3.0", "laravel/tinker": "^3.0",
"livewire/livewire": "^4.2", "livewire/livewire": "^4.2",
"lvl0/fedi-discover": "@dev" "lvl0/fedi-discover": "@dev",
"patrickschur/language-detection": "^5.3",
"spatie/robots-txt": "^2.5",
"symfony/dom-crawler": "^7.4"
}, },
"require-dev": { "require-dev": {
"fakerphp/faker": "^1.23", "fakerphp/faker": "^1.23",

387
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically" "This file is @generated automatically"
], ],
"content-hash": "e46e58784ec34415557c78db6bb6c97e", "content-hash": "4d6e239c94fea8e9511f1e73f05db1df",
"packages": [ "packages": [
{ {
"name": "brick/math", "name": "brick/math",
@ -508,6 +508,71 @@
], ],
"time": "2025-03-06T22:45:56+00:00" "time": "2025-03-06T22:45:56+00:00"
}, },
{
"name": "fivefilters/readability.php",
"version": "v3.3.3",
"source": {
"type": "git",
"url": "https://github.com/fivefilters/readability.php.git",
"reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/fivefilters/readability.php/zipball/e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8",
"reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-mbstring": "*",
"ext-xml": "*",
"league/uri": "^7.0",
"masterminds/html5": "^2.0",
"php": ">=8.1",
"psr/log": "^1.0 || ^2.0 || ^3.0"
},
"require-dev": {
"monolog/monolog": "^3.0",
"phpunit/phpunit": "^10.0 || ^11.0"
},
"suggest": {
"monolog/monolog": "Allow logging debug information"
},
"type": "library",
"autoload": {
"psr-4": {
"fivefilters\\Readability\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"Apache-2.0"
],
"authors": [
{
"name": "Andres Rey",
"email": "andreskrey@gmail.com",
"role": "Original Developer"
},
{
"name": "Keyvan Minoukadeh",
"email": "keyvan@fivefilters.org",
"homepage": "https://www.fivefilters.org",
"role": "Developer/Maintainer"
}
],
"description": "A PHP port of Readability.js",
"homepage": "https://github.com/fivefilters/readability.php",
"keywords": [
"html",
"readability"
],
"support": {
"issues": "https://github.com/fivefilters/readability.php/issues",
"source": "https://github.com/fivefilters/readability.php/tree/v3.3.3"
},
"time": "2025-04-26T23:45:37+00:00"
},
{ {
"name": "fruitcake/php-cors", "name": "fruitcake/php-cors",
"version": "v1.4.0", "version": "v1.4.0",
@ -2102,7 +2167,7 @@
}, },
{ {
"name": "lvl0/fedi-discover", "name": "lvl0/fedi-discover",
"version": "dev-main", "version": "dev-release/0.1.0",
"dist": { "dist": {
"type": "path", "type": "path",
"url": "packages/Lvl0/FediDiscover", "url": "packages/Lvl0/FediDiscover",
@ -2142,6 +2207,73 @@
"relative": true "relative": true
} }
}, },
{
"name": "masterminds/html5",
"version": "2.10.0",
"source": {
"type": "git",
"url": "https://github.com/Masterminds/html5-php.git",
"reference": "fcf91eb64359852f00d921887b219479b4f21251"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/Masterminds/html5-php/zipball/fcf91eb64359852f00d921887b219479b4f21251",
"reference": "fcf91eb64359852f00d921887b219479b4f21251",
"shasum": ""
},
"require": {
"ext-dom": "*",
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7 || ^8 || ^9"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "2.7-dev"
}
},
"autoload": {
"psr-4": {
"Masterminds\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Matt Butcher",
"email": "technosophos@gmail.com"
},
{
"name": "Matt Farina",
"email": "matt@mattfarina.com"
},
{
"name": "Asmir Mustafic",
"email": "goetas@gmail.com"
}
],
"description": "An HTML5 parser and serializer.",
"homepage": "http://masterminds.github.io/html5-php",
"keywords": [
"HTML5",
"dom",
"html",
"parser",
"querypath",
"serializer",
"xml"
],
"support": {
"issues": "https://github.com/Masterminds/html5-php/issues",
"source": "https://github.com/Masterminds/html5-php/tree/2.10.0"
},
"time": "2025-07-25T09:04:22+00:00"
},
{ {
"name": "monolog/monolog", "name": "monolog/monolog",
"version": "3.10.0", "version": "3.10.0",
@ -2653,6 +2785,57 @@
], ],
"time": "2026-02-16T23:10:27+00:00" "time": "2026-02-16T23:10:27+00:00"
}, },
{
"name": "patrickschur/language-detection",
"version": "v5.3.1",
"source": {
"type": "git",
"url": "https://github.com/patrickschur/language-detection.git",
"reference": "df8d32021b2ef9fde52e6fcccb83e3806822c9c6"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/patrickschur/language-detection/zipball/df8d32021b2ef9fde52e6fcccb83e3806822c9c6",
"reference": "df8d32021b2ef9fde52e6fcccb83e3806822c9c6",
"shasum": ""
},
"require": {
"ext-json": "*",
"ext-mbstring": "*",
"php": "^7.4 || ^8.0"
},
"require-dev": {
"phpunit/phpunit": "^9.5.0"
},
"type": "library",
"autoload": {
"psr-4": {
"LanguageDetection\\": "src/LanguageDetection"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Patrick Schur",
"email": "patrick_schur@outlook.de"
}
],
"description": "A language detection library for PHP. Detects the language from a given text string.",
"homepage": "https://github.com/patrickschur/language-detection",
"keywords": [
"detect",
"detection",
"language"
],
"support": {
"issues": "https://github.com/patrickschur/language-detection/issues",
"source": "https://github.com/patrickschur/language-detection/tree/v5.3.1"
},
"time": "2025-03-25T22:47:08+00:00"
},
{ {
"name": "phpoption/phpoption", "name": "phpoption/phpoption",
"version": "1.9.5", "version": "1.9.5",
@ -3417,6 +3600,66 @@
}, },
"time": "2025-12-14T04:43:48+00:00" "time": "2025-12-14T04:43:48+00:00"
}, },
{
"name": "spatie/robots-txt",
"version": "2.5.4",
"source": {
"type": "git",
"url": "https://github.com/spatie/robots-txt.git",
"reference": "a8dd35d0a94e863f52509a366a634978e9c1db03"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/spatie/robots-txt/zipball/a8dd35d0a94e863f52509a366a634978e9c1db03",
"reference": "a8dd35d0a94e863f52509a366a634978e9c1db03",
"shasum": ""
},
"require": {
"php": "^8.1"
},
"require-dev": {
"phpunit/phpunit": "^11.5.2"
},
"type": "library",
"autoload": {
"psr-4": {
"Spatie\\Robots\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Brent Roose",
"email": "brent@spatie.be",
"homepage": "https://spatie.be",
"role": "Developer"
}
],
"description": "Determine if a page may be crawled from robots.txt and robots meta tags",
"homepage": "https://github.com/spatie/robots-txt",
"keywords": [
"robots-txt",
"spatie"
],
"support": {
"issues": "https://github.com/spatie/robots-txt/issues",
"source": "https://github.com/spatie/robots-txt/tree/2.5.4"
},
"funding": [
{
"url": "https://spatie.be/open-source/support-us",
"type": "custom"
},
{
"url": "https://github.com/spatie",
"type": "github"
}
],
"time": "2026-02-25T07:59:20+00:00"
},
{ {
"name": "symfony/clock", "name": "symfony/clock",
"version": "v7.4.8", "version": "v7.4.8",
@ -3729,6 +3972,78 @@
], ],
"time": "2024-09-25T14:21:43+00:00" "time": "2024-09-25T14:21:43+00:00"
}, },
{
"name": "symfony/dom-crawler",
"version": "v7.4.8",
"source": {
"type": "git",
"url": "https://github.com/symfony/dom-crawler.git",
"reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/dom-crawler/zipball/2918e7c2ba964defca1f5b69c6f74886529e2dc8",
"reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8",
"shasum": ""
},
"require": {
"masterminds/html5": "^2.6",
"php": ">=8.2",
"symfony/deprecation-contracts": "^2.5|^3",
"symfony/polyfill-ctype": "~1.8",
"symfony/polyfill-mbstring": "~1.0"
},
"require-dev": {
"symfony/css-selector": "^6.4|^7.0|^8.0"
},
"type": "library",
"autoload": {
"psr-4": {
"Symfony\\Component\\DomCrawler\\": ""
},
"exclude-from-classmap": [
"/Tests/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Fabien Potencier",
"email": "fabien@symfony.com"
},
{
"name": "Symfony Community",
"homepage": "https://symfony.com/contributors"
}
],
"description": "Eases DOM navigation for HTML and XML documents",
"homepage": "https://symfony.com",
"support": {
"source": "https://github.com/symfony/dom-crawler/tree/v7.4.8"
},
"funding": [
{
"url": "https://symfony.com/sponsor",
"type": "custom"
},
{
"url": "https://github.com/fabpot",
"type": "github"
},
{
"url": "https://github.com/nicolas-grekas",
"type": "github"
},
{
"url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
"type": "tidelift"
}
],
"time": "2026-03-24T13:12:05+00:00"
},
{ {
"name": "symfony/error-handler", "name": "symfony/error-handler",
"version": "v7.4.8", "version": "v7.4.8",
@ -4416,7 +4731,7 @@
}, },
{ {
"name": "symfony/polyfill-ctype", "name": "symfony/polyfill-ctype",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-ctype.git", "url": "https://github.com/symfony/polyfill-ctype.git",
@ -4475,7 +4790,7 @@
"portable" "portable"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-ctype/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-ctype/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -4499,16 +4814,16 @@
}, },
{ {
"name": "symfony/polyfill-intl-grapheme", "name": "symfony/polyfill-intl-grapheme",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-intl-grapheme.git", "url": "https://github.com/symfony/polyfill-intl-grapheme.git",
"reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df" "reference": "4864388bfbd3001ce88e234fab652acd91fdc57e"
}, },
"dist": { "dist": {
"type": "zip", "type": "zip",
"url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/ad1b7b9092976d6c948b8a187cec9faaea9ec1df", "url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/4864388bfbd3001ce88e234fab652acd91fdc57e",
"reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df", "reference": "4864388bfbd3001ce88e234fab652acd91fdc57e",
"shasum": "" "shasum": ""
}, },
"require": { "require": {
@ -4557,7 +4872,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -4577,11 +4892,11 @@
"type": "tidelift" "type": "tidelift"
} }
], ],
"time": "2026-04-10T16:19:22+00:00" "time": "2026-04-26T13:13:48+00:00"
}, },
{ {
"name": "symfony/polyfill-intl-idn", "name": "symfony/polyfill-intl-idn",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-intl-idn.git", "url": "https://github.com/symfony/polyfill-intl-idn.git",
@ -4644,7 +4959,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -4668,7 +4983,7 @@
}, },
{ {
"name": "symfony/polyfill-intl-normalizer", "name": "symfony/polyfill-intl-normalizer",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-intl-normalizer.git", "url": "https://github.com/symfony/polyfill-intl-normalizer.git",
@ -4729,7 +5044,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -4753,7 +5068,7 @@
}, },
{ {
"name": "symfony/polyfill-mbstring", "name": "symfony/polyfill-mbstring",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-mbstring.git", "url": "https://github.com/symfony/polyfill-mbstring.git",
@ -4814,7 +5129,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-mbstring/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-mbstring/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -4838,7 +5153,7 @@
}, },
{ {
"name": "symfony/polyfill-php80", "name": "symfony/polyfill-php80",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-php80.git", "url": "https://github.com/symfony/polyfill-php80.git",
@ -4898,7 +5213,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-php80/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-php80/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -4922,7 +5237,7 @@
}, },
{ {
"name": "symfony/polyfill-php83", "name": "symfony/polyfill-php83",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-php83.git", "url": "https://github.com/symfony/polyfill-php83.git",
@ -4978,7 +5293,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-php83/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-php83/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -5002,7 +5317,7 @@
}, },
{ {
"name": "symfony/polyfill-php84", "name": "symfony/polyfill-php84",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-php84.git", "url": "https://github.com/symfony/polyfill-php84.git",
@ -5058,7 +5373,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-php84/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-php84/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -5082,16 +5397,16 @@
}, },
{ {
"name": "symfony/polyfill-php85", "name": "symfony/polyfill-php85",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-php85.git", "url": "https://github.com/symfony/polyfill-php85.git",
"reference": "2c408a6bb0313e6001a83628dc5506100474254e" "reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee"
}, },
"dist": { "dist": {
"type": "zip", "type": "zip",
"url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/2c408a6bb0313e6001a83628dc5506100474254e", "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/fcfa4973a9917cef23f2e38774da74a2b7d115ee",
"reference": "2c408a6bb0313e6001a83628dc5506100474254e", "reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee",
"shasum": "" "shasum": ""
}, },
"require": { "require": {
@ -5138,7 +5453,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-php85/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-php85/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -5158,11 +5473,11 @@
"type": "tidelift" "type": "tidelift"
} }
], ],
"time": "2026-04-10T16:50:15+00:00" "time": "2026-04-26T13:10:57+00:00"
}, },
{ {
"name": "symfony/polyfill-uuid", "name": "symfony/polyfill-uuid",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-uuid.git", "url": "https://github.com/symfony/polyfill-uuid.git",
@ -5221,7 +5536,7 @@
"uuid" "uuid"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-uuid/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-uuid/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -6059,16 +6374,16 @@
}, },
{ {
"name": "voku/portable-ascii", "name": "voku/portable-ascii",
"version": "2.1.0", "version": "2.1.1",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/voku/portable-ascii.git", "url": "https://github.com/voku/portable-ascii.git",
"reference": "d870a33f0f79d2b4579740b0620200221ee44aeb" "reference": "8e1051fe39379367aecf014f41744ce7539a856f"
}, },
"dist": { "dist": {
"type": "zip", "type": "zip",
"url": "https://api.github.com/repos/voku/portable-ascii/zipball/d870a33f0f79d2b4579740b0620200221ee44aeb", "url": "https://api.github.com/repos/voku/portable-ascii/zipball/8e1051fe39379367aecf014f41744ce7539a856f",
"reference": "d870a33f0f79d2b4579740b0620200221ee44aeb", "reference": "8e1051fe39379367aecf014f41744ce7539a856f",
"shasum": "" "shasum": ""
}, },
"require": { "require": {
@ -6105,7 +6420,7 @@
], ],
"support": { "support": {
"issues": "https://github.com/voku/portable-ascii/issues", "issues": "https://github.com/voku/portable-ascii/issues",
"source": "https://github.com/voku/portable-ascii/tree/2.1.0" "source": "https://github.com/voku/portable-ascii/tree/2.1.1"
}, },
"funding": [ "funding": [
{ {
@ -6129,7 +6444,7 @@
"type": "tidelift" "type": "tidelift"
} }
], ],
"time": "2026-04-16T23:10:39+00:00" "time": "2026-04-26T05:33:54+00:00"
} }
], ],
"packages-dev": [ "packages-dev": [

47
config/crawler.php Normal file
View file

@ -0,0 +1,47 @@
<?php
declare(strict_types=1);
return [

    /*
    |---------------------------------------------------------------------------
    | HTTP timeout (seconds)
    |---------------------------------------------------------------------------
    |
    | Hard cap on a single fetch. Guzzle's default is 0 (wait forever) — never
    | acceptable for a crawler. Tune up cautiously; longer timeouts amplify the
    | impact of slow targets on overall throughput.
    |
    | NOTE(review): intentionally left uncast. An (int) cast would truncate a
    | fractional value such as "0.5" to 0, i.e. "wait forever". When set via
    | the environment this arrives as a numeric string — confirm the consumer
    | accepts that.
    |
    */

    'timeout' => env('CRAWLER_TIMEOUT', 10),

    /*
    |---------------------------------------------------------------------------
    | Maximum redirects to follow
    |---------------------------------------------------------------------------
    |
    | Guzzle default is 5. Cross-origin redirects are accepted in v0.1 — the
    | search engine treats the post-redirect URL as the canonical one for
    | indexing.
    |
    | env() yields a string when the variable is set, so the value is cast to
    | keep it an integer regardless of where it came from.
    |
    */

    'max_redirects' => (int) env('CRAWLER_MAX_REDIRECTS', 5),

    /*
    |---------------------------------------------------------------------------
    | User-Agent
    |---------------------------------------------------------------------------
    |
    | Identifies our crawler to target servers. The placeholder below is for
    | v0.1 development; ticket #10 replaces it with the production identity
    | and adds a `/bot` info page that the URL points at.
    |
    */

    'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'),

    /*
    |---------------------------------------------------------------------------
    | Minimum per-domain delay (seconds)
    |---------------------------------------------------------------------------
    |
    | Presumably the minimum pause between two fetches against the same
    | domain — verify against the crawl scheduler. Cast to int because env()
    | returns a string when the variable is set.
    |
    */

    'min_domain_delay_seconds' => (int) env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10),

    /*
    |---------------------------------------------------------------------------
    | robots.txt cache TTL (seconds)
    |---------------------------------------------------------------------------
    |
    | Defaults to 60 * 60 * 24 seconds (24 hours). Presumably how long a
    | fetched robots.txt is reused before being requested again — verify
    | against the robots handling code. Cast to int for the same reason as
    | the settings above.
    |
    */

    'robots_cache_ttl_seconds' => (int) env('CRAWLER_ROBOTS_CACHE_TTL_SECONDS', 60 * 60 * 24),

];

282
config/livewire.php Normal file
View file

@ -0,0 +1,282 @@
<?php
return [
/*
|---------------------------------------------------------------------------
| Component Locations
|---------------------------------------------------------------------------
|
| This value sets the root directories that'll be used to resolve view-based
| components like single and multi-file components. The make command will
| use the first directory in this array to add new component files to.
|
*/
'component_locations' => [
resource_path('views/components'),
resource_path('views/livewire'),
],
/*
|---------------------------------------------------------------------------
| Component Namespaces
|---------------------------------------------------------------------------
|
| This value sets default namespaces that will be used to resolve view-based
| components like single-file and multi-file components. These folders'll
| also be referenced when creating new components via the make command.
|
*/
'component_namespaces' => [
'layouts' => resource_path('views/layouts'),
'pages' => resource_path('views/pages'),
],
/*
|---------------------------------------------------------------------------
| Page Layout
|---------------------------------------------------------------------------
| The view that will be used as the layout when rendering a single component as
| an entire page via `Route::livewire('/post/create', 'pages::create-post')`.
| In this case, the content of pages::create-post will render into $slot.
|
*/
'component_layout' => 'layouts::app',
/*
|---------------------------------------------------------------------------
| Lazy Loading Placeholder
|---------------------------------------------------------------------------
| Livewire allows you to lazy load components that would otherwise slow down
| the initial page load. Every component can have a custom placeholder or
| you can define the default placeholder view for all components below.
|
*/
'component_placeholder' => null, // Example: 'placeholders::skeleton'
/*
|---------------------------------------------------------------------------
| Make Command
|---------------------------------------------------------------------------
| This value determines the default configuration for the artisan make command
| You can configure the component type (sfc, mfc, class) and whether to use
| the high-voltage (⚡) emoji as a prefix in the sfc|mfc component names.
|
*/
'make_command' => [
'type' => 'class', // Options: 'sfc', 'mfc', 'class'
'emoji' => false, // Options: true, false
'with' => [
'js' => false,
'css' => false,
'test' => false,
],
],
/*
|---------------------------------------------------------------------------
| Class Namespace
|---------------------------------------------------------------------------
|
| This value sets the root class namespace for Livewire component classes in
| your application. This value will change where component auto-discovery
| finds components. It's also referenced by the file creation commands.
|
*/
'class_namespace' => 'App\\Livewire',
/*
|---------------------------------------------------------------------------
| Class Path
|---------------------------------------------------------------------------
|
| This value is used to specify the path where Livewire component class files
| are created when running creation commands like `artisan make:livewire`.
| This path is customizable to match your project's directory structure.
|
*/
'class_path' => app_path('Livewire'),
/*
|---------------------------------------------------------------------------
| View Path
|---------------------------------------------------------------------------
|
| This value is used to specify where Livewire component Blade templates are
| stored when running file creation commands like `artisan make:livewire`.
| It is also used if you choose to omit a component's render() method.
|
*/
'view_path' => resource_path('views/livewire'),
/*
|---------------------------------------------------------------------------
| Temporary File Uploads
|---------------------------------------------------------------------------
|
| Livewire handles file uploads by storing uploads in a temporary directory
| before the file is stored permanently. All file uploads are directed to
| a global endpoint for temporary storage. You may configure this below:
|
*/
'temporary_file_upload' => [
'disk' => env('LIVEWIRE_TEMPORARY_FILE_UPLOAD_DISK'), // Example: 'local', 's3' | Default: 'default'
'rules' => null, // Example: ['file', 'mimes:png,jpg'] | Default: ['required', 'file', 'max:12288'] (12MB)
'directory' => null, // Example: 'tmp' | Default: 'livewire-tmp'
'middleware' => null, // Example: 'throttle:5,1' | Default: 'throttle:60,1'
'preview_mimes' => [ // Supported file types for temporary pre-signed file URLs...
'png', 'gif', 'bmp', 'svg', 'wav', 'mp4',
'mov', 'avi', 'wmv', 'mp3', 'm4a',
'jpg', 'jpeg', 'mpga', 'webp', 'wma',
],
'max_upload_time' => 5, // Max duration (in minutes) before an upload is invalidated...
'cleanup' => true, // Should cleanup temporary uploads older than 24 hrs...
],
/*
|---------------------------------------------------------------------------
| Render On Redirect
|---------------------------------------------------------------------------
|
| This value determines if Livewire will run a component's `render()` method
| after a redirect has been triggered using something like `redirect(...)`
| Setting this to true will render the view once more before redirecting
|
*/
'render_on_redirect' => false,
/*
|---------------------------------------------------------------------------
| Eloquent Model Binding
|---------------------------------------------------------------------------
|
| Previous versions of Livewire supported binding directly to eloquent model
| properties using wire:model by default. However, this behavior has been
| deemed too "magical" and has therefore been put under a feature flag.
|
*/
'legacy_model_binding' => false,
/*
|---------------------------------------------------------------------------
| Auto-inject Frontend Assets
|---------------------------------------------------------------------------
|
| By default, Livewire automatically injects its JavaScript and CSS into the
| <head> and <body> of pages containing Livewire components. By disabling
| this behavior, you need to use @livewireStyles and @livewireScripts.
|
*/
'inject_assets' => true,
/*
|---------------------------------------------------------------------------
| Navigate (SPA mode)
|---------------------------------------------------------------------------
|
| By adding `wire:navigate` to links in your Livewire application, Livewire
| will prevent the default link handling and instead request those pages
| via AJAX, creating an SPA-like effect. Configure this behavior here.
|
*/
'navigate' => [
'show_progress_bar' => true,
'progress_bar_color' => '#2299dd',
],
/*
|---------------------------------------------------------------------------
| HTML Morph Markers
|---------------------------------------------------------------------------
|
| Livewire intelligently "morphs" existing HTML into the newly rendered HTML
| after each update. To make this process more reliable, Livewire injects
| "markers" into the rendered Blade surrounding @if, @class & @foreach.
|
*/
'inject_morph_markers' => true,
/*
|---------------------------------------------------------------------------
| Smart Wire Keys
|---------------------------------------------------------------------------
|
| Livewire uses loops and keys used within loops to generate smart keys that
| are applied to nested components that don't have them. This makes using
| nested components more reliable by ensuring that they all have keys.
|
*/
'smart_wire_keys' => true,
/*
|---------------------------------------------------------------------------
| Pagination Theme
|---------------------------------------------------------------------------
|
| When enabling Livewire's pagination feature by using the `WithPagination`
| trait, Livewire will use Tailwind templates to render pagination views
| on the page. If you want Bootstrap CSS, you can specify: "bootstrap"
|
*/
'pagination_theme' => 'tailwind',
/*
|---------------------------------------------------------------------------
| Release Token
|---------------------------------------------------------------------------
|
| This token is stored client-side and sent along with each request to check
| a user's session to see if a new release has invalidated it. If there is
| a mismatch it will throw an error and prompt for a browser refresh.
|
*/
'release_token' => 'a',
/*
|---------------------------------------------------------------------------
| CSP Safe
|---------------------------------------------------------------------------
|
| This config is used to determine if Livewire will use the CSP-safe version
| of Alpine in its bundle. This is useful for applications that are using
| strict Content Security Policy (CSP) to protect against XSS attacks.
|
*/
'csp_safe' => false,
/*
|---------------------------------------------------------------------------
| Payload Guards
|---------------------------------------------------------------------------
|
| These settings protect against malicious or oversized payloads that could
| cause denial of service. The default values should feel reasonable for
| most web applications. Each can be set to null to disable the limit.
|
*/
'payload' => [
'max_size' => 1024 * 1024, // 1MB - maximum request payload size in bytes
'max_nesting_depth' => 10, // Maximum depth of dot-notation property paths
'max_calls' => 50, // Maximum method calls per request
'max_components' => 20, // Maximum components per batch request
],
];

View file

@ -14,6 +14,12 @@
| |
*/ */
'ntfy' => [
    // Empty variables (e.g. `NTFY_URL=`) come back from env() as '' and are
    // normalised to null so "not configured" has a single representation.
    'url' => env('NTFY_URL') ?: null,
    'topic' => env('NTFY_TOPIC') ?: null,
    // A bare `NTFY_THRESHOLD=` line yields '' rather than null; a plain
    // `!== null` check would then cast it to 0. Only numeric input becomes an
    // integer (an explicit "0" is preserved); anything else means "unset".
    'threshold' => is_numeric(env('NTFY_THRESHOLD')) ? (int) env('NTFY_THRESHOLD') : null,
],
'postmark' => [ 'postmark' => [
'key' => env('POSTMARK_API_KEY'), 'key' => env('POSTMARK_API_KEY'),
], ],

View file

@ -0,0 +1,53 @@
<?php
declare(strict_types=1);
namespace Database\Factories;
use App\Enums\CrawlOutcomeEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use Illuminate\Database\Eloquent\Factories\Factory;
/**
 * Model factory for PageCrawl.
 *
 * @extends Factory<PageCrawl>
 */
class PageCrawlFactory extends Factory
{
    /**
     * Baseline attributes: no page attached, fixed domain, priority 0 and
     * every completion-related field left null.
     *
     * @return array<string, mixed>
     */
    public function definition(): array
    {
        $identity = [
            'page_id' => null,
            'domain' => 'example.com',
            'priority' => 0,
        ];

        // Array union keeps the identity keys first, so the resulting key
        // order is the same as listing every attribute in one literal.
        $completion = array_fill_keys(
            ['completed_at', 'outcome', 'status_code', 'error_message'],
            null,
        );

        return $identity + $completion;
    }

    /**
     * Attach the crawl to the given page.
     */
    public function page(Page $page): static
    {
        return $this->state(function () use ($page): array {
            return ['page_id' => $page->id];
        });
    }

    /**
     * Mark the crawl as completed with a Success outcome.
     */
    public function successful(): static
    {
        return $this->state(function (): array {
            return [
                'outcome' => CrawlOutcomeEnum::Success,
                'completed_at' => now(),
            ];
        });
    }

    /**
     * Mark the crawl as completed with a Failed outcome and the given message.
     */
    public function failed(string $errorMessage): static
    {
        return $this->state(function () use ($errorMessage): array {
            return [
                'outcome' => CrawlOutcomeEnum::Failed,
                'completed_at' => now(),
                'error_message' => $errorMessage,
            ];
        });
    }
}

View file

@ -0,0 +1,26 @@
<?php
declare(strict_types=1);
namespace Database\Factories;
use App\Enums\PageStatusEnum;
use App\Models\Page;
use Illuminate\Database\Eloquent\Factories\Factory;
/**
 * Model factory for Page.
 *
 * @extends Factory<Page>
 */
class PageFactory extends Factory
{
    /**
     * Default attributes for a page in the Discovered status.
     *
     * @return array<string, mixed>
     */
    public function definition(): array
    {
        return [
            // `pages.url` carries a unique index, so ask Faker for values it
            // has not handed out before; this avoids sporadic constraint
            // violations when many pages are created in one run.
            'url' => fake()->unique()->url(),
            'status' => PageStatusEnum::Discovered,
        ];
    }
}

View file

@ -0,0 +1,34 @@
<?php
declare(strict_types=1);
namespace Database\Factories;
use App\Models\Page;
use App\Models\PageLink;
use Illuminate\Database\Eloquent\Factories\Factory;
/**
 * Model factory for PageLink.
 *
 * @extends Factory<PageLink>
 */
class PageLinkFactory extends Factory
{
    /**
     * No default attributes are supplied; the `page_links` table declares
     * both page ids as non-nullable, so set them through withSource() and
     * withTarget().
     *
     * @return array<string, mixed>
     */
    public function definition(): array
    {
        $defaults = [];

        return $defaults;
    }

    /**
     * Use the given page as the link's source.
     */
    public function withSource(Page $page): static
    {
        return $this->state(function () use ($page): array {
            return ['source_page_id' => $page->id];
        });
    }

    /**
     * Use the given page as the link's target.
     */
    public function withTarget(Page $page): static
    {
        return $this->state(function () use ($page): array {
            return ['target_page_id' => $page->id];
        });
    }
}

View file

@ -0,0 +1,36 @@
<?php
declare(strict_types=1);
use App\Enums\PageStatusEnum;
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Create the `pages` table.
     */
    public function up(): void
    {
        // New rows start out in the Discovered status unless told otherwise.
        $initialStatus = PageStatusEnum::Discovered->value;

        Schema::create('pages', static function (Blueprint $table) use ($initialStatus): void {
            $table->id();

            // One row per URL.
            $table->text('url')->unique();
            $table->string('status')->default($initialStatus)->index();

            // Language tag plus a confidence value stored as decimal(4,3).
            $table->string('language', 35)->nullable()->index();
            $table->decimal('language_confidence', 4, 3)->nullable();

            $table->string('title')->nullable();

            // Optional link to a fedi_discover_instances row; the column is
            // nulled when that row is deleted.
            $table->foreignId('instance_id')->nullable()->constrained('fedi_discover_instances')->nullOnDelete();

            $table->timestampTz('posted_at')->nullable();
            $table->timestampTz('fetched_at')->nullable();
            $table->timestampTz('failed_at')->nullable();
            $table->timestampsTz();
        });
    }

    /**
     * Drop the `pages` table.
     */
    public function down(): void
    {
        Schema::dropIfExists('pages');
    }
};

View file

@ -0,0 +1,27 @@
<?php
declare(strict_types=1);
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Create the `page_links` table: one row per (source page, target page) pair.
     */
    public function up(): void
    {
        Schema::create('page_links', static function (Blueprint $table): void {
            $table->id();

            // Both ends reference `pages`; no on-delete action is declared.
            foreach (['source_page_id', 'target_page_id'] as $column) {
                $table->foreignId($column)->constrained('pages');
            }

            $table->timestampsTz();

            // A given source/target pair may only be recorded once.
            $table->unique(['source_page_id', 'target_page_id']);
        });
    }

    /**
     * Drop the `page_links` table.
     */
    public function down(): void
    {
        Schema::dropIfExists('page_links');
    }
};

View file

@ -0,0 +1,34 @@
<?php
declare(strict_types=1);
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Create the `page_crawls` table.
     */
    public function up(): void
    {
        Schema::create('page_crawls', static function (Blueprint $table): void {
            $table->id();

            // Crawl rows are removed together with their page.
            $table->foreignId('page_id')->constrained('pages')->cascadeOnDelete();

            $table->string('domain');
            $table->smallInteger('priority')->default(0);

            // Result fields are all nullable.
            $table->timestampTz('completed_at')->nullable();
            $table->string('outcome')->nullable();
            $table->smallInteger('status_code')->nullable();
            $table->text('error_message')->nullable();

            $table->timestampsTz();

            // Composite index over a page's crawls and their creation time.
            $table->index(['page_id', 'created_at']);
        });
    }

    /**
     * Drop the `page_crawls` table.
     */
    public function down(): void
    {
        Schema::dropIfExists('page_crawls');
    }
};

128
docker/prod/Dockerfile Normal file
View file

@ -0,0 +1,128 @@
# syntax=docker/dockerfile:1
# ============================================================
# Stage 1: Build frontend assets
# ============================================================
FROM node:20-alpine AS frontend
WORKDIR /app
# Install dependencies from the manifests alone so this layer is reused until
# package.json / package-lock.json change.
COPY package.json package-lock.json ./
RUN npm ci --no-audit --no-fund
# Sources change far more often than dependencies, so they are copied after
# npm ci to avoid invalidating the dependency layer on every edit.
COPY vite.config.js ./
COPY resources/ resources/
RUN npm run build
# ============================================================
# Stage 2: Runtime (FrankenPHP)
# ============================================================
# Runtime base image: FrankenPHP 1.1 with PHP 8.3 on Alpine.
FROM dunglas/frankenphp:1.1-php8.3-alpine AS runtime
# System packages. The start script below calls pg_isready (presumably provided by
# postgresql-client) and the HEALTHCHECK calls curl.
RUN apk add --no-cache \
git \
postgresql-client \
curl
# PHP extensions installed into the runtime image.
RUN install-php-extensions \
pdo_pgsql \
redis \
opcache \
zip \
gd \
intl
# Take the composer binary from the composer:2 image rather than installing it.
COPY --from=composer:2 /usr/bin/composer /usr/bin/composer
WORKDIR /app
# Default environment baked into the image: production mode, PostgreSQL at db:5432,
# Redis at redis:6379 for cache, queue and sessions, log drivers for broadcast and mail.
ENV APP_ENV=production \
APP_DEBUG=false \
LOG_CHANNEL=stack \
LOG_LEVEL=warning \
DB_CONNECTION=pgsql \
DB_HOST=db \
DB_PORT=5432 \
REDIS_HOST=redis \
REDIS_PORT=6379 \
CACHE_STORE=redis \
QUEUE_CONNECTION=redis \
SESSION_DRIVER=redis \
BROADCAST_CONNECTION=log \
MAIL_MAILER=log
# Copy only the files composer needs before install, so the dependency layer stays
# cached when application source changes. packages/ is required because composer.json
# declares it as a path repository.
COPY composer.json composer.lock ./
COPY packages/ packages/
# Install dependencies only. --no-autoloader defers autoload generation until the
# application source is present; --no-scripts skips post-autoload scripts
# (package:discover), which need a runtime Laravel boot and run in start-prod.sh.
RUN composer install --no-dev --no-interaction --prefer-dist --no-autoloader --no-scripts
COPY . .
COPY --from=frontend /app/public/build /app/public/build
# Build the authoritative classmap AFTER the application source has been copied.
# With --classmap-authoritative the autoloader never falls back to a PSR-4
# filesystem lookup, so any class missing from the map at dump time is unloadable.
RUN composer dump-autoload --no-dev --no-interaction --classmap-authoritative --no-scripts
RUN chown -R www-data:www-data /app/storage /app/bootstrap/cache
# Write the Caddyfile inline: serve /app/public on :8000 through php_server, with
# gzip/zstd encoding, static file serving and three response headers.
RUN cat > /etc/caddy/Caddyfile <<'EOF'
{
frankenphp
order php_server before file_server
}
:8000 {
root * /app/public
php_server {
index index.php
}
encode gzip zstd
file_server
header {
X-Frame-Options "SAMEORIGIN"
X-Content-Type-Options "nosniff"
Referrer-Policy "strict-origin-when-cross-origin"
}
}
EOF
EXPOSE 8000
# Probe the /up endpoint on the port configured above.
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD curl -fsS http://localhost:8000/up || exit 1
# Write the entrypoint inline: wait up to 60 one-second attempts for PostgreSQL,
# run package discovery, build the config/route/view caches, apply migrations,
# then exec FrankenPHP with the Caddyfile written above.
RUN cat > /start-prod.sh <<'EOF'
#!/bin/sh
set -e
echo "Waiting for PostgreSQL at ${DB_HOST}:${DB_PORT}..."
for i in $(seq 1 60); do
if pg_isready -h "${DB_HOST}" -p "${DB_PORT}" -q; then
echo "PostgreSQL is ready."
break
fi
if [ "$i" = "60" ]; then
echo "Timed out waiting for PostgreSQL after 60s." >&2
exit 1
fi
sleep 1
done
php artisan package:discover --ansi
php artisan config:cache
php artisan route:cache
php artisan view:cache
php artisan migrate --force
exec frankenphp run --config /etc/caddy/Caddyfile
EOF
RUN chmod +x /start-prod.sh
CMD ["/start-prod.sh"]

View file

@ -3,5 +3,20 @@
declare(strict_types=1); declare(strict_types=1);
return [ return [
// Instance list, polling intervals, and HTTP client config land here. 'http' => [
'timeout' => 10,
// Default points at the project site so fediverse admins can always trace a Trove poller
// back to the project. Operators running their own deployment should override this via
// `php artisan vendor:publish --tag=fedi-discover-config` with their own contact URL.
'user_agent' => 'Trove/1.0 (+https://trove.lvl0.xyz)',
'max_redirects' => 3,
],
'defaults' => [
// Minimum recommended: 60. Mastodon/Lemmy rate limits apply per-instance.
'interval_seconds' => 300,
],
// Instances are DB-managed (table: fedi_discover_instances).
// See the Instance model + admin UI (TBD). No instance list here.
]; ];

View file

@ -0,0 +1,34 @@
<?php
declare(strict_types=1);
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Create the fedi_discover_instances table holding the pollable instances.
     */
    public function up(): void
    {
        Schema::create('fedi_discover_instances', function (Blueprint $table) {
            $table->id();
            $table->string('type');
            // Instance origin, e.g. https://mastodon.social. Not a full endpoint path.
            $table->string('url');
            $table->boolean('enabled')->default(true);
            $table->unsignedInteger('interval_seconds')->default(300);
            $table->json('extras')->default('{}');
            $table->unsignedInteger('consecutive_poll_failures')->default(0);
            $table->timestampTz('last_polled_at')->nullable();
            $table->string('last_seen_id')->nullable();
            // Timezone-aware like last_polled_at above, so every timestamp in
            // this table uses the same column type.
            $table->timestampsTz();
            $table->unique(['type', 'url']);
        });
    }

    /**
     * Drop the fedi_discover_instances table.
     */
    public function down(): void
    {
        Schema::dropIfExists('fedi_discover_instances');
    }
};

View file

@ -0,0 +1,83 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Actions;
use Carbon\CarbonImmutable;
use Illuminate\Support\Facades\Log;
use Lvl0\FediDiscover\Clients\FediverseClientFactory;
use Lvl0\FediDiscover\Events\UrlDiscovered;
use Lvl0\FediDiscover\Models\Instance;
use Lvl0\FediDiscover\ValueObjects\FediversePost;
use Throwable;
/**
 * Polls one instance for new posts and dispatches a UrlDiscovered event for
 * every external URL found in the post bodies.
 */
class PollFediverseAction
{
    public function __construct(private FediverseClientFactory $factory) {}

    /**
     * Fetch posts newer than the instance's stored cursor, emit discovered URLs,
     * then persist the new cursor and reset the failure counter.
     *
     * A failure while processing a single post is logged and counted as zero
     * URLs; it does not abort the poll. Exceptions from the client propagate.
     */
    public function execute(Instance $instance): void
    {
        $start = microtime(true);
        $client = $this->factory->for($instance);
        $posts = $client->fetchPostsSince($instance, $instance->last_seen_id);

        $urlCount = $posts
            ->map(function (FediversePost $post) use ($instance): int {
                try {
                    return $this->processLinks($post, $instance);
                } catch (Throwable $e) {
                    Log::warning('fedi-discover:processLinks failed', [
                        'instance_id' => $instance->id,
                        'instance_url' => $instance->url,
                        'post_url' => $post->selfUrl,
                        'exception' => $e::class,
                        'message' => $e->getMessage(),
                    ]);

                    // Explicit zero so the sum never depends on an implicit null.
                    return 0;
                }
            })
            ->sum();

        if ($posts->isNotEmpty()) {
            // Clients return newest-first, so the first post is the high-water mark.
            $instance->last_seen_id = $posts->first()->cursorId;
        }

        $instance->consecutive_poll_failures = 0;
        $instance->last_polled_at = now();
        $instance->save();

        Log::info('fedi-discover:poll succeeded', [
            'instance_id' => $instance->id,
            'url_count' => $urlCount,
            'duration_ms' => (int) round((microtime(true) - $start) * 1000),
        ]);
    }

    /**
     * Extract the distinct external URLs from a post body and dispatch one
     * UrlDiscovered event per URL.
     *
     * @return int number of URLs dispatched
     */
    private function processLinks(FediversePost $post, Instance $instance): int
    {
        if ($post->body === null) {
            return 0;
        }

        // Bodies may be HTML, where "&" inside a URL is written as "&amp;".
        // Decode entities before matching so the extracted URLs are the real ones.
        $text = html_entity_decode($post->body, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        $linksFound = preg_match_all('~https?://[^\s<>"\'()\[\]]+~', $text, $matches);

        // preg_match_all returns false on a PCRE error; treat that as "no links".
        if ($linksFound === false || $linksFound === 0) {
            return 0;
        }

        // Host names are case-insensitive; parse the instance host only once.
        $instanceHost = strtolower((string) parse_url($instance->url, PHP_URL_HOST));

        return collect($matches[0])
            ->map(fn (string $u) => rtrim($u, '.,;:!?'))
            ->filter(fn (string $u) => filter_var($u, FILTER_VALIDATE_URL) !== false)
            ->filter(fn (string $u) => strtolower((string) parse_url($u, PHP_URL_HOST)) !== $instanceHost)
            ->unique()
            ->each(fn (string $url) => UrlDiscovered::dispatch(
                url: $url,
                instanceId: $instance->id,
                discoveredAt: CarbonImmutable::now(),
                postUrl: $post->selfUrl,
                postBody: $post->body,
            ))
            ->count();
    }
}

View file

@ -0,0 +1,24 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Clients;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
/**
 * Chooses the client implementation that matches an instance's type.
 */
class FediverseClientFactory
{
    public function __construct(
        private MastodonClient $mastodonClient,
        private LemmyClient $lemmyClient,
    ) {}

    /**
     * Return the client for the given instance's type.
     */
    public function for(Instance $instance): FediverseClientInterface
    {
        $client = match ($instance->type) {
            InstanceType::Lemmy => $this->lemmyClient,
            InstanceType::Mastodon => $this->mastodonClient,
        };

        return $client;
    }
}

View file

@ -0,0 +1,22 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Clients;
use Illuminate\Support\Collection;
use Lvl0\FediDiscover\Models\Instance;
use Lvl0\FediDiscover\ValueObjects\FediversePost;
/**
 * Contract for a client that reads posts from one kind of fediverse instance.
 */
interface FediverseClientInterface
{
    /**
     * Fetch posts newer than the given cursor.
     *
     * MUST return posts in newest-first order. Callers treat the
     * first item as the new high-water mark.
     *
     * @param Instance $instance the instance to fetch posts from
     * @param string|null $lastSeenId cursor of the newest post already seen, or null when there is none
     * @return Collection<int, FediversePost>
     */
    public function fetchPostsSince(Instance $instance, ?string $lastSeenId): Collection;
}

View file

@ -0,0 +1,43 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Clients;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Http;
use Lvl0\FediDiscover\Models\Instance;
use Lvl0\FediDiscover\ValueObjects\FediversePost;
/**
 * Reads posts from a Lemmy instance through its v3 post list endpoint.
 */
class LemmyClient implements FediverseClientInterface
{
    /**
     * Fetch posts newer than $lastSeenId, newest first.
     *
     * The Lemmy post list endpoint has no "min_id" parameter, so the cursor is
     * applied here: posts are requested sorted by "New", those with an id not
     * greater than the cursor are dropped, and the remainder is ordered by id
     * descending so the first item is always the high-water mark.
     *
     * A non-2xx response yields an empty collection.
     *
     * @return Collection<int, FediversePost>
     */
    public function fetchPostsSince(Instance $instance, ?string $lastSeenId): Collection
    {
        $url = 'https://' . parse_url($instance->url, PHP_URL_HOST) . '/api/v3/post/list';

        $response = Http::withHeaders([
            'User-Agent' => config('fedi-discover.http.user_agent'),
        ])->timeout(config('fedi-discover.http.timeout'))->get($url, ['sort' => 'New']);

        if (! $response->successful()) {
            return collect();
        }

        return collect($response->json('posts', []))
            ->map(fn (array $p) => $p['post'])
            ->filter(fn (array $t) => $lastSeenId === null || (int) $t['id'] > (int) $lastSeenId)
            // Enforce newest-first regardless of the order the server returned.
            ->sortByDesc(fn (array $t) => (int) $t['id'])
            ->values()
            ->map(function (array $t) {
                // The linked URL is appended to the body so link extraction sees it.
                $parts = array_filter([$t['body'] ?? null, $t['url'] ?? null]);
                $body = $parts ? implode(' ', $parts) : null;

                return new FediversePost(
                    cursorId: (string) $t['id'],
                    selfUrl: $t['ap_id'],
                    body: $body,
                    title: $t['name'],
                    publishedAt: $t['published']
                );
            });
    }
}

View file

@ -0,0 +1,36 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Clients;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Http;
use Lvl0\FediDiscover\Models\Instance;
use Lvl0\FediDiscover\ValueObjects\FediversePost;
/**
 * Reads statuses from a Mastodon instance's public timeline.
 */
class MastodonClient implements FediverseClientInterface
{
    /**
     * Fetch public-timeline statuses, passing $lastSeenId as min_id when set.
     *
     * A non-2xx response or a body that is not JSON yields an empty collection.
     * Entries that are not arrays or carry no id are skipped rather than
     * aborting the whole poll.
     *
     * @return Collection<int, FediversePost>
     */
    public function fetchPostsSince(Instance $instance, ?string $lastSeenId): Collection
    {
        $url = 'https://' . parse_url($instance->url, PHP_URL_HOST) . '/api/v1/timelines/public';
        $params = $lastSeenId !== null ? ['min_id' => $lastSeenId] : [];

        $response = Http::withHeaders([
            'User-Agent' => config('fedi-discover.http.user_agent'),
        ])->timeout(config('fedi-discover.http.timeout'))->get($url, $params);

        if (! $response->successful()) {
            return collect();
        }

        return collect($response->json() ?? [])
            // A JSON error object or scalar would otherwise hit the array type hint below.
            ->filter(fn (mixed $t) => is_array($t) && isset($t['id']))
            ->values()
            ->map(fn (array $t) => new FediversePost(
                // Cast like LemmyClient does: strict_types rejects a numeric id.
                cursorId: (string) $t['id'],
                selfUrl: $t['url'] ?? $t['uri'] ?? null,
                body: $t['content'] ?? null,
                publishedAt: $t['created_at'] ?? null
            ));
    }
}

View file

@ -0,0 +1,65 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Config;
use InvalidArgumentException;
/**
 * Immutable, validated description of one fediverse instance to poll.
 */
final readonly class InstanceConfig
{
    /**
     * @param array<string, mixed> $extras
     */
    public function __construct(
        public InstanceType $type,
        public string $url,
        public bool $enabled,
        public int $intervalSeconds,
        public array $extras
    ) {}

    /**
     * Build a config from an array with the keys type, url, enabled,
     * interval_seconds and, optionally, extras.
     *
     * Every validation failure, including a value of the wrong type, is
     * reported as an InvalidArgumentException. Without the explicit type
     * checks, strict_types would surface those as a TypeError instead.
     *
     * @param array<string, mixed> $array
     *
     * @throws InvalidArgumentException
     */
    public static function fromArray(array $array): self
    {
        foreach (['type', 'url', 'enabled', 'interval_seconds'] as $key) {
            if (! array_key_exists($key, $array)) {
                throw new InvalidArgumentException("Missing required key: {$key}");
            }
        }

        if (! is_int($array['interval_seconds'])) {
            throw new InvalidArgumentException('Interval seconds needs to be an integer');
        }

        if ($array['interval_seconds'] <= 0) {
            throw new InvalidArgumentException('Interval seconds needs to be larger than zero');
        }

        // tryFrom() only accepts the enum's backing type; anything else is invalid.
        $type = is_string($array['type']) ? InstanceType::tryFrom($array['type']) : null;

        if ($type === null) {
            throw new InvalidArgumentException('Invalid type: ' . self::describe($array['type']));
        }

        if (! is_string($array['url']) || filter_var($array['url'], FILTER_VALIDATE_URL) === false) {
            throw new InvalidArgumentException('Invalid URL: ' . self::describe($array['url']));
        }

        if (! is_bool($array['enabled'])) {
            throw new InvalidArgumentException('Enabled needs to be a boolean');
        }

        $extras = $array['extras'] ?? [];

        if (! is_array($extras)) {
            throw new InvalidArgumentException('Extras needs to be an array');
        }

        return new self(
            type: $type,
            url: $array['url'],
            enabled: $array['enabled'],
            intervalSeconds: $array['interval_seconds'],
            extras: $extras
        );
    }

    /**
     * Export the config using the same keys fromArray() accepts.
     *
     * @return array<string, mixed>
     */
    public function toArray(): array
    {
        return [
            'type' => $this->type->value,
            'url' => $this->url,
            'enabled' => $this->enabled,
            'interval_seconds' => $this->intervalSeconds,
            'extras' => $this->extras,
        ];
    }

    /**
     * Render a rejected value for an error message: scalars as text, anything
     * else by its type name.
     */
    private static function describe(mixed $value): string
    {
        return is_scalar($value) ? (string) $value : get_debug_type($value);
    }
}

View file

@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Config;
/**
 * Kinds of fediverse software an instance can run, backed by the string
 * value used when the type is stored or serialized.
 */
enum InstanceType: string
{
    /** Backed by 'mastodon'. */
    case Mastodon = 'mastodon';

    /** Backed by 'lemmy'. */
    case Lemmy = 'lemmy';
}

View file

@ -0,0 +1,61 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Console\Commands;
use Illuminate\Console\Attributes\Description;
use Illuminate\Console\Attributes\Signature;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\Log;
use Lvl0\FediDiscover\Actions\PollFediverseAction;
use Lvl0\FediDiscover\Events\PollFailed;
use Lvl0\FediDiscover\Models\Instance;
use Throwable;
#[Signature('fedi-discover:poll')]
#[Description('Poll all enabled fediverse instances for new URLs')]
class PollInstancesCommand extends Command
{
    public function __construct(
        private readonly PollFediverseAction $action
    ) {
        parent::__construct();
    }

    /**
     * Poll every enabled instance in turn.
     *
     * A failing instance is reported and logged immediately but does not stop
     * the run. Once all instances have been attempted, one PollFailed event is
     * dispatched per failure and the command exits with FAILURE.
     */
    public function handle(): int
    {
        /** @var list<array{0: Instance, 1: string}> $failures */
        $failures = [];

        foreach (Instance::enabled()->get() as $instance) {
            try {
                $this->action->execute($instance);
            } catch (Throwable $exception) {
                $reason = $exception->getMessage();

                $this->error("Failed to poll {$instance->url}: {$reason}");
                Log::warning('fedi-discover:poll failed', [
                    'instance_id' => $instance->id,
                    'instance_url' => $instance->url,
                    'exception' => $exception::class,
                    'message' => $reason,
                ]);

                $failures[] = [$instance, $reason];
            }
        }

        if ($failures === []) {
            return self::SUCCESS;
        }

        // Events are sent only after the whole run has finished.
        foreach ($failures as [$failedInstance, $reason]) {
            PollFailed::dispatch($failedInstance, $reason, now()->toImmutable());
        }

        return self::FAILURE;
    }
}

View file

@ -0,0 +1,64 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Console\Commands;
use Illuminate\Console\Attributes\Description;
use Illuminate\Console\Attributes\Signature;
use Illuminate\Console\Command;
use Lvl0\FediDiscover\Models\Instance;
#[Signature('fedi-discover:validate {--enabled-only}')]
#[Description('Validate saved instances')]
class ValidateInstancesCommand extends Command
{
    /**
     * Check the URL and polling interval of the stored instances, print a
     * summary, and exit with FAILURE when at least one instance is invalid.
     */
    public function handle(): int
    {
        $query = Instance::query();

        if ($this->option('enabled-only')) {
            $query->enabled();
        }

        $instances = $query->get();

        /** @var list<array{0: Instance, 1: list<string>}> $invalid */
        $invalid = [];

        foreach ($instances as $instance) {
            $problems = [];

            if (filter_var($instance->url, FILTER_VALIDATE_URL) === false) {
                $problems[] = 'Invalid URL: ' . $instance->url;
            }

            if ($instance->interval_seconds < 1) {
                $problems[] = 'Invalid interval seconds: ' . $instance->interval_seconds;
            }

            if ($problems !== []) {
                $invalid[] = [$instance, $problems];
            }
        }

        $total = $instances->count();
        $invalidCount = count($invalid);

        $this->info((string) $total);
        $this->info(($total - $invalidCount) . ' valid');
        $this->line($invalidCount . ' invalid');

        if ($invalidCount === 0) {
            return self::SUCCESS;
        }

        foreach ($invalid as [$instance, $problems]) {
            $this->warn($instance->id . ' - ' . $instance->url);
            $this->line(' : ' . implode(', ', $problems));
        }

        return self::FAILURE;
    }
}

View file

@ -0,0 +1,54 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Database\Factories;
use Illuminate\Database\Eloquent\Factories\Factory;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
/**
 * Factory for Instance models.
 *
 * @extends Factory<Instance>
 */
class InstanceFactory extends Factory
{
    protected $model = Instance::class;

    /**
     * Default attributes: an enabled Mastodon instance.
     *
     * The defaults are valid on their own, so the factory can persist a row
     * without any state applied. The `type` and `enabled` columns are
     * NOT NULL, so null defaults would fail on insert. Use type(), enabled()
     * and disabled() to override them.
     *
     * @return array<string, mixed>
     */
    public function definition(): array
    {
        return [
            'type' => InstanceType::Mastodon->value,
            'url' => fake()->url(),
            'enabled' => true,
            'interval_seconds' => 600,
            'extras' => [],
            'last_seen_id' => null,
            'last_polled_at' => now(),
        ];
    }

    /**
     * Set the instance type.
     */
    public function type(InstanceType $type): self
    {
        return $this->state(fn () => [
            'type' => $type->value,
        ]);
    }

    /**
     * Mark the instance as enabled.
     */
    public function enabled(): self
    {
        return $this->state(fn () => [
            'enabled' => true,
        ]);
    }

    /**
     * Mark the instance as disabled.
     */
    public function disabled(): self
    {
        return $this->state(fn () => [
            'enabled' => false,
        ]);
    }
}

View file

@ -0,0 +1,23 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Events;
use Carbon\CarbonImmutable;
use Illuminate\Foundation\Events\Dispatchable;
use Illuminate\Queue\SerializesModels;
use Lvl0\FediDiscover\Models\Instance;
/**
 * Event carrying the instance whose poll failed, the failure message and the
 * time of the failure.
 */
class PollFailed
{
    use Dispatchable, SerializesModels;

    public Instance $instance;

    public string $message;

    public CarbonImmutable $failedAt;

    public function __construct(Instance $instance, string $message, CarbonImmutable $failedAt)
    {
        $this->instance = $instance;
        $this->message = $message;
        $this->failedAt = $failedAt;
    }
}

View file

@ -0,0 +1,22 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Events;
use Carbon\CarbonImmutable;
use Illuminate\Foundation\Events\Dispatchable;
use Illuminate\Queue\SerializesModels;
/**
 * Event carrying one discovered URL together with the id of the instance it
 * came from, the discovery time and, optionally, the post's URL and body.
 */
class UrlDiscovered
{
    use Dispatchable, SerializesModels;

    public string $url;

    public int $instanceId;

    public CarbonImmutable $discoveredAt;

    public ?string $postUrl;

    public ?string $postBody;

    public function __construct(
        string $url,
        int $instanceId,
        CarbonImmutable $discoveredAt,
        ?string $postUrl = null,
        ?string $postBody = null,
    ) {
        $this->url = $url;
        $this->instanceId = $instanceId;
        $this->discoveredAt = $discoveredAt;
        $this->postUrl = $postUrl;
        $this->postBody = $postBody;
    }
}

View file

@ -5,20 +5,32 @@
namespace Lvl0\FediDiscover; namespace Lvl0\FediDiscover;
use Illuminate\Support\ServiceProvider; use Illuminate\Support\ServiceProvider;
use Lvl0\FediDiscover\Clients\FediverseClientFactory;
use Lvl0\FediDiscover\Console\Commands\PollInstancesCommand;
use Lvl0\FediDiscover\Console\Commands\ValidateInstancesCommand;
class FediDiscoverServiceProvider extends ServiceProvider class FediDiscoverServiceProvider extends ServiceProvider
{ {
public function register(): void public function register(): void
{ {
$this->mergeConfigFrom(__DIR__ . '/../config/fedi-discover.php', 'fedi-discover'); $this->mergeConfigFrom(__DIR__ . '/../config/fedi-discover.php', 'fedi-discover');
$this->app->singleton(FediverseClientFactory::class);
} }
public function boot(): void public function boot(): void
{ {
$this->loadMigrationsFrom(__DIR__ . '/../database/migrations');
if ($this->app->runningInConsole()) { if ($this->app->runningInConsole()) {
$this->publishes([ $this->publishes([
__DIR__ . '/../config/fedi-discover.php' => config_path('fedi-discover.php'), __DIR__ . '/../config/fedi-discover.php' => config_path('fedi-discover.php'),
], 'fedi-discover-config'); ], 'fedi-discover-config');
$this->commands([
PollInstancesCommand::class,
ValidateInstancesCommand::class,
]);
} }
} }
} }

View file

@ -0,0 +1,64 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Models;
use App\Models\Page;
use Illuminate\Database\Eloquent\Builder;
use Illuminate\Database\Eloquent\Factories\Factory;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\HasMany;
use Illuminate\Support\Carbon;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Database\Factories\InstanceFactory;
/**
 * A fediverse instance that is polled for new posts
 * (table: fedi_discover_instances).
 *
 * @property int $id
 * @property InstanceType $type
 * @property string $url
 * @property bool $enabled
 * @property int $interval_seconds
 * @property array<string, mixed> $extras
 * @property string|null $last_seen_id
 * @property int $consecutive_poll_failures
 * @property Carbon|null $last_polled_at
 * @property Carbon $created_at
 * @property Carbon $updated_at
 */
class Instance extends Model
{
    /** @use HasFactory<InstanceFactory> */
    use HasFactory;

    protected $table = 'fedi_discover_instances';

    protected $fillable = ['type', 'url', 'enabled', 'interval_seconds', 'extras', 'last_seen_id', 'last_polled_at', 'consecutive_poll_failures'];

    protected $casts = [
        'type' => InstanceType::class,
        'enabled' => 'boolean',
        // Cast the integer columns so they are ints even when assigned as
        // strings, matching the @property declarations above.
        'interval_seconds' => 'integer',
        'consecutive_poll_failures' => 'integer',
        'extras' => 'array',
        'last_polled_at' => 'datetime',
    ];

    /**
     * Restrict a query to instances whose `enabled` flag is true.
     *
     * @param Builder<self> $query
     * @return Builder<self>
     */
    public function scopeEnabled(Builder $query): Builder
    {
        return $query->where('enabled', true);
    }

    /**
     * Use the package's own factory class for this model.
     */
    protected static function newFactory(): Factory
    {
        return InstanceFactory::new();
    }

    /**
     * Pages related to this instance.
     */
    public function pages(): HasMany
    {
        return $this->hasMany(Page::class);
    }
}

View file

@ -0,0 +1,16 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\ValueObjects;
/**
 * Plain data holder for one post returned by a fediverse client.
 */
class FediversePost
{
    /** Identifier used as the polling cursor. */
    public string $cursorId;

    /** URL of the post itself, when known. */
    public ?string $selfUrl;

    public ?string $body;

    public ?string $title;

    /** Publication time as the string supplied by the client. */
    public ?string $publishedAt;

    public function __construct(
        string $cursorId,
        ?string $selfUrl,
        ?string $body = null,
        ?string $title = null,
        ?string $publishedAt = null,
    ) {
        $this->cursorId = $cursorId;
        $this->selfUrl = $selfUrl;
        $this->body = $body;
        $this->title = $title;
        $this->publishedAt = $publishedAt;
    }
}

View file

@ -0,0 +1,45 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Lvl0\FediDiscover\Clients\FediverseClientFactory;
use Lvl0\FediDiscover\Clients\LemmyClient;
use Lvl0\FediDiscover\Clients\MastodonClient;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Covers client resolution by instance type and the factory's container binding.
 */
class FediverseClientFactoryTest extends TestCase
{
    public function test_it_resolves_mastodon_client_for_mastodon_instance_type(): void
    {
        $resolved = $this->resolveClientFor(InstanceType::Mastodon, 'https://mastodon.social');

        $this->assertInstanceOf(MastodonClient::class, $resolved);
    }

    public function test_it_resolves_lemmy_client_for_lemmy_instance_type(): void
    {
        $resolved = $this->resolveClientFor(InstanceType::Lemmy, 'https://lemmy.world');

        $this->assertInstanceOf(LemmyClient::class, $resolved);
    }

    public function test_it_is_registered_as_a_singleton_in_the_container(): void
    {
        $first = $this->app->make(FediverseClientFactory::class);
        $second = $this->app->make(FediverseClientFactory::class);

        $this->assertSame($first, $second);
    }

    /**
     * Resolve the factory from the container and ask it for a client for an
     * unsaved instance of the given type and URL.
     */
    private function resolveClientFor(InstanceType $type, string $url): object
    {
        $instance = new Instance(['type' => $type, 'url' => $url]);

        return app(FediverseClientFactory::class)->for($instance);
    }
}

View file

@ -0,0 +1,57 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Lvl0\FediDiscover\Config\InstanceConfig;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Covers storing an InstanceConfig through the Instance model and reading it back.
 */
class InstanceConfigPersistenceTest extends TestCase
{
    use RefreshDatabase;

    public function test_instance_config_to_array_is_mass_assignable_on_the_model(): void
    {
        $attributes = [
            'type' => InstanceType::Mastodon->value,
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => 600,
            'extras' => ['token' => 'abc123'],
        ];

        Instance::create(InstanceConfig::fromArray($attributes)->toArray());

        $this->artisan('fedi-discover:validate')
            ->assertExitCode(0);
    }

    public function test_an_instance_config_survives_a_write_read_cycle_through_the_model(): void
    {
        $expected = InstanceConfig::fromArray([
            'type' => InstanceType::Mastodon->value,
            'url' => 'https://hachyderm.io',
            'enabled' => false,
            'interval_seconds' => 900,
            'extras' => ['foo' => 'bar'],
        ]);
        Instance::create($expected->toArray());

        // Rebuild the config from what the database returned.
        $stored = Instance::query()->firstOrFail();
        $actual = InstanceConfig::fromArray([
            'type' => $stored->type->value,
            'url' => $stored->url,
            'enabled' => $stored->enabled,
            'interval_seconds' => $stored->interval_seconds,
            'extras' => $stored->extras,
        ]);

        $this->assertEquals($expected, $actual);
    }
}

View file

@ -0,0 +1,113 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Carbon;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Covers persistence, casts, fillable attributes and the enabled scope of Instance.
 */
class InstanceModelTest extends TestCase
{
    use RefreshDatabase;

    public function test_it_persists_and_retrieves_an_instance(): void
    {
        $this->storeInstance(['extras' => ['token' => 'abc123']]);

        $found = Instance::first();

        $this->assertNotNull($found);
        $this->assertSame(InstanceType::Mastodon, $found->type);
        $this->assertSame('https://mastodon.social', $found->url);
        $this->assertTrue($found->enabled);
        $this->assertSame(600, $found->interval_seconds);
        $this->assertSame(['token' => 'abc123'], $found->extras);
    }

    public function test_enabled_is_fillable_and_cast_to_boolean(): void
    {
        $stored = $this->storeInstance(['enabled' => false]);

        $this->assertFalse($stored->fresh()->enabled);
    }

    public function test_last_polled_at_is_fillable_and_cast_to_datetime(): void
    {
        $polledAt = Carbon::parse('2026-04-23 12:00:00');

        $reloaded = $this->storeInstance(['last_polled_at' => $polledAt])->fresh();

        $this->assertInstanceOf(Carbon::class, $reloaded->last_polled_at);
        $this->assertTrue($reloaded->last_polled_at->equalTo($polledAt));
    }

    public function test_last_seen_id_defaults_to_null(): void
    {
        $stored = $this->storeInstance();

        $this->assertNull($stored->fresh()->last_seen_id);
    }

    public function test_last_seen_id_is_fillable_and_persists_as_string(): void
    {
        $stored = $this->storeInstance(['last_seen_id' => '109876543210']);

        $this->assertSame('109876543210', $stored->fresh()->last_seen_id);
    }

    public function test_enabled_scope_returns_only_enabled_instances(): void
    {
        $this->storeInstance(['url' => 'https://enabled.example', 'enabled' => true]);
        $this->storeInstance(['url' => 'https://disabled.example', 'enabled' => false]);

        $matches = Instance::enabled()->get();

        $this->assertCount(1, $matches);
        $this->assertSame('https://enabled.example', $matches->first()->url);
    }

    /**
     * Persist an enabled Mastodon instance for https://mastodon.social with a
     * 600 second interval, with any of those attributes overridden or extended.
     *
     * @param array<string, mixed> $overrides
     */
    private function storeInstance(array $overrides = []): Instance
    {
        return Instance::create(array_merge([
            'type' => InstanceType::Mastodon,
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => 600,
        ], $overrides));
    }
}

View file

@ -0,0 +1,150 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Illuminate\Support\Facades\Http;
use Lvl0\FediDiscover\Clients\LemmyClient;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Lvl0\FediDiscover\ValueObjects\FediversePost;
use Tests\TestCase;
/**
 * Covers how LemmyClient maps the post list response to FediversePost objects.
 */
class LemmyClientTest extends TestCase
{
    public function test_it_maps_each_post_to_a_fediverse_post(): void
    {
        $this->fakePostList($this->lemmyPost(
            id: 42,
            apId: 'https://lemmy.world/post/42',
            name: 'My Great Post',
            body: 'Some body text',
            published: '2026-04-25T10:00:00.000000',
        ));

        $posts = (new LemmyClient)->fetchPostsSince($this->lemmyInstance(), null);
        $mapped = $posts->first();

        $this->assertCount(1, $posts);
        $this->assertInstanceOf(FediversePost::class, $mapped);
        $this->assertSame('42', $mapped->cursorId);
        $this->assertSame('https://lemmy.world/post/42', $mapped->selfUrl);
        $this->assertSame('My Great Post', $mapped->title);
        $this->assertSame('Some body text', $mapped->body);
        $this->assertSame('2026-04-25T10:00:00.000000', $mapped->publishedAt);
    }

    public function test_url_field_is_appended_to_body(): void
    {
        $this->fakePostList($this->lemmyPost(
            id: 42,
            apId: 'https://lemmy.world/post/42',
            url: 'https://example-garden.blog/post-42',
            body: 'Some original text.',
        ));

        $body = $this->fetchFirstPost()->body;

        $this->assertStringContainsString('Some original text.', $body);
        $this->assertStringContainsString('https://example-garden.blog/post-42', $body);
    }

    public function test_url_absent_leaves_body_clean(): void
    {
        $this->fakePostList($this->lemmyPost(
            id: 7,
            apId: 'https://lemmy.world/post/7',
            body: 'Just a regular post.',
        ));

        $this->assertSame('Just a regular post.', $this->fetchFirstPost()->body);
    }

    public function test_it_handles_posts_without_a_body_key(): void
    {
        // 'body' key intentionally absent — real Lemmy API omits it for link-only posts
        $this->fakePostList([
            'post' => [
                'id' => 99,
                'ap_id' => 'https://lemmy.world/post/99',
                'url' => null,
                'name' => 'Link-only post',
                'published' => '2026-04-25T10:00:00.000000',
            ],
        ]);

        $this->assertNull($this->fetchFirstPost()->body);
    }

    public function test_it_hits_the_post_list_endpoint_of_the_instance(): void
    {
        Http::fake([
            'lemmy.world/api/v3/post/list*' => Http::response(['posts' => []], 200),
        ]);

        (new LemmyClient)->fetchPostsSince($this->lemmyInstance(), null);

        Http::assertSent(function ($request) {
            $hitsEndpoint = str_starts_with($request->url(), 'https://lemmy.world/api/v3/post/list');

            return $hitsEndpoint && $request->method() === 'GET';
        });
    }

    /**
     * Fake every outgoing request with a 200 response listing the given posts.
     *
     * @param array<string, mixed> ...$posts
     */
    private function fakePostList(array ...$posts): void
    {
        Http::fake([
            '*' => Http::response(['posts' => $posts], 200),
        ]);
    }

    /**
     * Fetch without a cursor and return the first mapped post.
     */
    private function fetchFirstPost(): FediversePost
    {
        return (new LemmyClient)->fetchPostsSince($this->lemmyInstance(), null)->first();
    }

    private function lemmyInstance(): Instance
    {
        return new Instance([
            'type' => InstanceType::Lemmy,
            'url' => 'https://lemmy.world',
        ]);
    }

    /**
     * Build one entry of the Lemmy post list payload.
     *
     * @return array<string, mixed>
     */
    private function lemmyPost(
        int $id,
        string $apId,
        ?string $url = null,
        string $body = '',
        string $name = 'A post title',
        string $published = '2026-04-25T10:00:00.000000',
    ): array {
        $post = [
            'id' => $id,
            'ap_id' => $apId,
            'url' => $url,
            'body' => $body,
            'name' => $name,
            'published' => $published,
        ];

        return ['post' => $post];
    }
}

View file

@ -0,0 +1,191 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Http;
use Lvl0\FediDiscover\Clients\MastodonClient;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Lvl0\FediDiscover\ValueObjects\FediversePost;
use Tests\TestCase;
class MastodonClientTest extends TestCase
{
public function test_it_hits_the_public_timeline_endpoint_of_the_instance(): void
{
Http::fake([
'mastodon.social/api/v1/timelines/public*' => Http::response([], 200),
]);
(new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null);
Http::assertSent(fn ($request) => str_starts_with($request->url(), 'https://mastodon.social/api/v1/timelines/public')
&& $request->method() === 'GET'
);
}
public function test_it_omits_min_id_on_first_poll(): void
{
Http::fake(['*' => Http::response([], 200)]);
(new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null);
Http::assertSent(fn ($request) => ! str_contains($request->url(), 'min_id'));
}
public function test_it_passes_min_id_on_subsequent_polls(): void
{
Http::fake(['*' => Http::response([], 200)]);
(new MastodonClient)->fetchPostsSince($this->mastodonInstance(), '109876543210');
Http::assertSent(fn ($request) => str_contains($request->url(), 'min_id=109876543210'));
}
public function test_it_returns_an_empty_collection_when_the_api_returns_no_posts(): void
{
Http::fake(['*' => Http::response([], 200)]);
$posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null);
$this->assertInstanceOf(Collection::class, $posts);
$this->assertTrue($posts->isEmpty());
}
public function test_it_maps_each_status_to_a_fediverse_post(): void
{
Http::fake([
'*' => Http::response([
$this->mastodonStatus(id: '109876543210', url: 'https://mastodon.social/@alice/109876543210', content: '<p>Hello</p>'),
$this->mastodonStatus(id: '109876543211', url: 'https://mastodon.social/@bob/109876543211', content: '<p>World</p>'),
], 200),
]);
$posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null);
$this->assertCount(2, $posts);
$this->assertInstanceOf(FediversePost::class, $posts->first());
$this->assertSame('109876543210', $posts->first()->cursorId);
$this->assertSame('https://mastodon.social/@alice/109876543210', $posts->first()->selfUrl);
$this->assertSame('<p>Hello</p>', $posts->first()->body);
}
public function test_it_maps_published_at_from_created_at(): void
{
Http::fake([
'*' => Http::response([
$this->mastodonStatus(id: '109876543210', url: 'https://mastodon.social/@alice/109876543210'),
], 200),
]);
$posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null);
$this->assertSame('2026-04-25T10:00:00Z', $posts->first()->publishedAt);
}
public function test_it_sets_title_to_null_for_mastodon_statuses(): void
{
Http::fake([
'*' => Http::response([
$this->mastodonStatus(id: '109876543210', url: 'https://mastodon.social/@alice/109876543210'),
], 200),
]);
$posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null);
$this->assertNull($posts->first()->title);
}
public function test_it_falls_back_to_uri_when_url_is_null(): void
{
Http::fake([
'*' => Http::response([
$this->mastodonStatus(
id: '109876543210',
url: null,
uri: 'https://hachyderm.io/users/bob/statuses/5678',
content: '<p>federated post</p>'
),
], 200),
]);
$posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null);
$this->assertSame('https://hachyderm.io/users/bob/statuses/5678', $posts->first()->selfUrl);
}
/**
 * The client returns posts in the same order the API delivered them.
 */
public function test_it_preserves_newest_first_ordering_from_the_api(): void
{
    // [status id, account handle] pairs in the order the fake API returns them.
    $statuses = array_map(
        fn (array $pair): array => $this->mastodonStatus(
            id: $pair[0],
            url: "https://mastodon.social/@{$pair[1]}/{$pair[0]}",
        ),
        [['300', 'a'], ['200', 'b'], ['100', 'c']],
    );
    Http::fake(['*' => Http::response($statuses, 200)]);

    $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null);
    $cursorIds = $posts->pluck('cursorId')->all();

    $this->assertSame(['300', '200', '100'], $cursorIds);
}
/**
 * A 429 response yields an empty Collection.
 */
public function test_it_returns_an_empty_collection_on_a_non_2xx_response(): void
{
    $rateLimited = Http::response('Too many requests', 429);
    Http::fake(['*' => $rateLimited]);

    $client = new MastodonClient;
    $posts = $client->fetchPostsSince($this->mastodonInstance(), null);

    $this->assertInstanceOf(Collection::class, $posts);
    $this->assertTrue($posts->isEmpty());
}
/**
 * A 200 response whose body is HTML rather than JSON yields an empty Collection.
 */
public function test_it_returns_an_empty_collection_when_the_response_body_is_not_json(): void
{
    $htmlBody = Http::response('<html>error</html>', 200);
    Http::fake(['*' => $htmlBody]);

    $client = new MastodonClient;
    $posts = $client->fetchPostsSince($this->mastodonInstance(), null);

    $this->assertInstanceOf(Collection::class, $posts);
    $this->assertTrue($posts->isEmpty());
}
/**
 * The outgoing request carries the User-Agent configured under
 * `fedi-discover.http.user_agent`.
 */
public function test_it_sends_the_configured_user_agent(): void
{
    // Guard the precondition: without a real configured value the header
    // comparison below would not prove anything.
    $expected = config('fedi-discover.http.user_agent');
    $this->assertIsString($expected);
    $this->assertNotSame('', $expected);

    Http::fake(['*' => Http::response([], 200)]);

    (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null);

    // `?? null` turns a missing header into a clean assertion failure
    // instead of an "Undefined array key 0" warning inside the callback.
    Http::assertSent(
        fn ($request) => ($request->header('User-Agent')[0] ?? null) === $expected
    );
}
/**
 * Builds an unsaved Mastodon Instance model pointing at mastodon.social.
 */
private function mastodonInstance(): Instance
{
    $attributes = [
        'type' => InstanceType::Mastodon,
        'url' => 'https://mastodon.social',
    ];

    return new Instance($attributes);
}
/**
 * Builds one status payload entry for the faked API responses in this class.
 *
 * When no `$uri` is given, one is derived from `$id`.
 *
 * @return array<string, mixed>
 */
private function mastodonStatus(
    string $id,
    ?string $url = null,
    ?string $uri = null,
    string $content = '<p>example</p>',
): array {
    $resolvedUri = $uri;
    if ($resolvedUri === null) {
        $resolvedUri = "https://mastodon.social/users/x/statuses/{$id}";
    }

    $status = [];
    $status['id'] = $id;
    $status['url'] = $url;
    $status['uri'] = $resolvedUri;
    $status['content'] = $content;
    $status['created_at'] = '2026-04-25T10:00:00Z';
    $status['account'] = ['acct' => 'alice@mastodon.social'];

    return $status;
}
}

View file

@ -0,0 +1,268 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Carbon\CarbonImmutable;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Event;
use Illuminate\Support\Facades\Log;
use Lvl0\FediDiscover\Actions\PollFediverseAction;
use Lvl0\FediDiscover\Clients\FediverseClientFactory;
use Lvl0\FediDiscover\Clients\FediverseClientInterface;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Events\UrlDiscovered;
use Lvl0\FediDiscover\Models\Instance;
use Lvl0\FediDiscover\ValueObjects\FediversePost;
use Mockery;
use Tests\TestCase;
/**
 * Feature tests for PollFediverseAction.
 *
 * The fediverse client is always mocked; each test feeds the action a fixed
 * list of FediversePost objects and inspects the UrlDiscovered events, the
 * persisted Instance columns, or the log output that results.
 */
class PollFediverseActionTest extends TestCase
{
    use RefreshDatabase;

    public function test_it_fires_one_event_per_extracted_url(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://example.com/one and https://other.example/two'),
        ]);

        $this->assertUrlDiscovered('https://example.com/one');
        $this->assertUrlDiscovered('https://other.example/two');
        Event::assertDispatchedTimes(UrlDiscovered::class, 2);
    }

    public function test_it_extracts_urls_from_html_anchor_tags(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', '<p>Check <a href="https://example.com/article">this</a>!</p>'),
        ]);

        $this->assertUrlDiscovered('https://example.com/article');
        Event::assertDispatchedTimes(UrlDiscovered::class, 1);
    }

    public function test_it_extracts_urls_from_markdown_links(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll(
            posts: [new FediversePost('1', 'https://lemmy.world/post/42', 'A [great article](https://example.com/article) about trees.')],
            instanceUrl: 'https://lemmy.world',
        );

        $this->assertUrlDiscovered('https://example.com/article');
        Event::assertDispatchedTimes(UrlDiscovered::class, 1);
    }

    public function test_it_strips_trailing_punctuation_from_urls(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'Check https://example.com/article, it is great. Also https://other.example/page.'),
        ]);

        $this->assertUrlDiscovered('https://example.com/article');
        $this->assertUrlDiscovered('https://other.example/page');
        // Pin the count so an additional, unstripped variant of either URL
        // cannot slip through unnoticed.
        Event::assertDispatchedTimes(UrlDiscovered::class, 2);
    }

    public function test_it_deduplicates_urls_within_a_single_post(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'Here is https://example.com/article and again https://example.com/article'),
        ]);

        Event::assertDispatchedTimes(UrlDiscovered::class, 1);
    }

    public function test_it_filters_urls_on_the_polling_instance_host(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://mastodon.social/@bob/42 and https://example.com/article'),
        ]);

        $this->assertUrlDiscovered('https://example.com/article');
        Event::assertDispatchedTimes(UrlDiscovered::class, 1);
    }

    public function test_it_ignores_posts_with_a_null_body(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', null),
        ]);

        Event::assertNotDispatched(UrlDiscovered::class);
    }

    public function test_it_ignores_non_http_schemes(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'Email mailto:alice@example.com or try ftp://files.example.com/x'),
        ]);

        Event::assertNotDispatched(UrlDiscovered::class);
    }

    public function test_it_passes_post_self_url_and_body_through_to_the_event(): void
    {
        Event::fake([UrlDiscovered::class]);
        $instance = $this->makeInstance();
        $body = 'Here is https://example.com/article with surrounding context.';

        $this->pollInstance($instance, [
            new FediversePost('1', 'https://mastodon.social/@alice/1', $body),
        ]);

        Event::assertDispatched(
            UrlDiscovered::class,
            fn (UrlDiscovered $e) => $e->postUrl === 'https://mastodon.social/@alice/1'
                && $e->postBody === $body
                && $e->instanceId === $instance->id
                && $e->discoveredAt instanceof CarbonImmutable
        );
    }

    public function test_it_processes_multiple_posts(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://example.com/one'),
            new FediversePost('2', 'https://mastodon.social/@bob/2', 'Also https://example.com/two'),
        ]);

        Event::assertDispatchedTimes(UrlDiscovered::class, 2);
    }

    public function test_it_updates_last_seen_id_to_the_first_posts_cursor(): void
    {
        $instance = $this->makeInstance();

        // Clients return newest-first; the action treats posts[0]
        // as the new high-water mark without inspecting cursor values.
        $this->pollInstance($instance, [
            new FediversePost('newest-cursor', 'https://mastodon.social/@alice/3', 'x'),
            new FediversePost('middle-cursor', 'https://mastodon.social/@bob/2', 'y'),
            new FediversePost('oldest-cursor', 'https://mastodon.social/@carol/1', 'z'),
        ]);

        $this->assertSame('newest-cursor', $instance->fresh()->last_seen_id);
    }

    public function test_it_updates_last_polled_at(): void
    {
        $instance = $this->makeInstance();
        $this->assertNull($instance->last_polled_at);

        $this->pollInstance($instance, [
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'x'),
        ]);

        $this->assertNotNull($instance->fresh()->last_polled_at);
    }

    public function test_it_passes_the_existing_last_seen_id_to_the_client(): void
    {
        $instance = $this->makeInstance(['last_seen_id' => '999']);

        // Expect the literal cursor rather than `$instance->last_seen_id`:
        // reading it back from the model would still match (as null) if the
        // attribute had been silently dropped on creation.
        $client = Mockery::mock(FediverseClientInterface::class);
        $client->shouldReceive('fetchPostsSince')
            ->once()
            ->with($instance, '999')
            ->andReturn(collect());

        $factory = Mockery::mock(FediverseClientFactory::class);
        $factory->shouldReceive('for')->with($instance)->andReturn($client);

        (new PollFediverseAction($factory))->execute($instance);
    }

    public function test_it_leaves_last_seen_id_unchanged_when_no_posts_are_returned(): void
    {
        $instance = $this->makeInstance(['last_seen_id' => '500']);

        $this->pollInstance($instance, []);

        $this->assertSame('500', $instance->fresh()->last_seen_id);
    }

    public function test_consecutive_poll_failures_reset_to_zero_after_successful_poll(): void
    {
        $instance = $this->makeInstance(['consecutive_poll_failures' => 5]);

        $this->pollInstance($instance, []);

        $this->assertSame(0, $instance->fresh()->consecutive_poll_failures);
    }

    public function test_poll_logs_a_structured_success_entry_with_url_count_and_duration(): void
    {
        Log::spy();
        Event::fake([UrlDiscovered::class]);
        $instance = $this->makeInstance();

        // Three distinct URLs across two posts.
        $this->pollInstance($instance, [
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://example.com/one and https://other.example/two'),
            new FediversePost('2', 'https://mastodon.social/@bob/2', 'Also https://example.com/three'),
        ]);

        Log::shouldHaveReceived('info')
            ->once()
            ->withArgs(function (string $message, array $context) use ($instance): bool {
                return $message === 'fedi-discover:poll succeeded'
                    && $context['instance_id'] === $instance->id
                    && $context['url_count'] === 3
                    && isset($context['duration_ms'])
                    && $context['duration_ms'] >= 0;
            });
    }

    /**
     * Asserts that at least one UrlDiscovered event was dispatched for $url.
     */
    private function assertUrlDiscovered(string $url): void
    {
        Event::assertDispatched(
            UrlDiscovered::class,
            fn (UrlDiscovered $e) => $e->url === $url
        );
    }

    /**
     * Creates an instance at $instanceUrl and polls it with the given posts.
     *
     * @param  array<FediversePost>  $posts
     */
    private function poll(array $posts, string $instanceUrl = 'https://mastodon.social'): void
    {
        $this->pollInstance($this->makeInstance(['url' => $instanceUrl]), $posts);
    }

    /**
     * Runs the action against $instance with a mocked client that returns $posts.
     *
     * @param  array<FediversePost>  $posts
     */
    private function pollInstance(Instance $instance, array $posts): void
    {
        $client = Mockery::mock(FediverseClientInterface::class);
        $client->shouldReceive('fetchPostsSince')->andReturn(collect($posts));

        $factory = Mockery::mock(FediverseClientFactory::class);
        $factory->shouldReceive('for')->andReturn($client);

        (new PollFediverseAction($factory))->execute($instance);
    }

    /**
     * Persists an enabled Mastodon instance; $overrides replace the defaults.
     *
     * @param  array<string, mixed>  $overrides
     */
    private function makeInstance(array $overrides = []): Instance
    {
        return Instance::create(array_merge([
            'type' => InstanceType::Mastodon,
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => 600,
        ], $overrides));
    }
}

View file

@ -0,0 +1,202 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Event;
use Lvl0\FediDiscover\Actions\PollFediverseAction;
use Lvl0\FediDiscover\Clients\FediverseClientFactory;
use Lvl0\FediDiscover\Clients\FediverseClientInterface;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Events\PollFailed;
use Lvl0\FediDiscover\Models\Instance;
use Mockery;
use RuntimeException;
use Tests\TestCase;
/**
 * Feature tests for the `fedi-discover:poll` artisan command.
 *
 * By default the command resolves the real PollFediverseAction backed by a
 * stubbed client factory (see setUp()); tests that need to observe or fail
 * individual polls replace the action itself with a Mockery mock.
 */
class PollInstancesCommandTest extends TestCase
{
    use RefreshDatabase;

    private const FAILING_URL = 'https://failing.example';

    protected function setUp(): void
    {
        parent::setUp();

        // Bind a no-op factory stub so the command can resolve PollFediverseAction
        // from the container without making real HTTP calls.
        $clientStub = Mockery::mock(FediverseClientInterface::class);
        $clientStub->shouldReceive('fetchPostsSince')->andReturn(collect());

        $factoryStub = Mockery::mock(FediverseClientFactory::class);
        $factoryStub->shouldReceive('for')->andReturn($clientStub);

        $this->app->instance(FediverseClientFactory::class, $factoryStub);
    }

    public function test_it_exits_zero_when_there_are_no_enabled_instances(): void
    {
        $this->artisan('fedi-discover:poll')
            ->assertExitCode(0);
    }

    public function test_it_calls_the_action_for_each_enabled_instance_and_skips_disabled(): void
    {
        $enabled1 = $this->createInstance('https://mastodon.social');
        $enabled2 = $this->createInstance('https://fosstodon.org');
        $this->createInstance('https://disabled.example', enabled: false);

        $calledWith = [];
        $this->bindAction(2, function (Instance $instance) use (&$calledWith): void {
            $calledWith[] = $instance->url;
        });

        $this->artisan('fedi-discover:poll')->assertExitCode(0);

        $this->assertEqualsCanonicalizing(
            [$enabled1->url, $enabled2->url],
            $calledWith,
        );
    }

    public function test_one_instance_throwing_does_not_stop_remaining_instances_from_being_polled(): void
    {
        $this->createInstance(self::FAILING_URL);
        $healthy = $this->createInstance('https://healthy.example');

        $calledWith = [];
        $this->bindAction(2, function (Instance $instance) use (&$calledWith): void {
            $calledWith[] = $instance->url;
            if ($instance->url === self::FAILING_URL) {
                throw new RuntimeException('Connection refused');
            }
        });

        $this->artisan('fedi-discover:poll')->assertExitCode(1);

        $this->assertEqualsCanonicalizing(
            [self::FAILING_URL, $healthy->url],
            $calledWith,
        );
    }

    public function test_poll_failed_event_is_dispatched_when_action_throws(): void
    {
        Event::fake([PollFailed::class]);
        $instance = $this->createInstance(self::FAILING_URL);

        $this->bindAction(1, function (): void {
            throw new RuntimeException('Connection refused');
        });

        // The only instance fails, so the command is expected to exit 1
        // (same contract as test_it_exits_one_when_at_least_one_instance_fails).
        $this->artisan('fedi-discover:poll')->assertExitCode(1);

        Event::assertDispatched(PollFailed::class, function (PollFailed $event) use ($instance): bool {
            return $event->instance->id === $instance->id
                && $event->message === 'Connection refused';
        });
    }

    public function test_poll_failed_event_is_not_dispatched_on_a_successful_poll(): void
    {
        Event::fake([PollFailed::class]);
        $this->createInstance('https://healthy.example');

        // No action override here: the real action runs against the no-op
        // client factory bound in setUp(), which returns no posts.
        $this->artisan('fedi-discover:poll')->assertExitCode(0);

        Event::assertNotDispatched(PollFailed::class);
    }

    public function test_it_exits_one_when_at_least_one_instance_fails(): void
    {
        $this->createInstance(self::FAILING_URL);
        $this->createInstance('https://healthy.example');

        $this->bindAction(2, function (Instance $instance): void {
            if ($instance->url === self::FAILING_URL) {
                throw new RuntimeException('Connection refused');
            }
        });

        $this->artisan('fedi-discover:poll')->assertExitCode(1);
    }

    /**
     * Persists a Mastodon instance with a 600 second polling interval.
     */
    private function createInstance(string $url, bool $enabled = true): Instance
    {
        return Instance::create([
            'type' => InstanceType::Mastodon,
            'url' => $url,
            'enabled' => $enabled,
            'interval_seconds' => 600,
        ]);
    }

    /**
     * Replaces PollFediverseAction in the container with a mock whose
     * execute() must be called exactly $times times and runs $behaviour.
     *
     * @param  callable(Instance): void  $behaviour
     */
    private function bindAction(int $times, callable $behaviour): void
    {
        $action = Mockery::mock(PollFediverseAction::class);
        $action->shouldReceive('execute')
            ->times($times)
            ->andReturnUsing($behaviour);

        $this->app->instance(PollFediverseAction::class, $action);
    }
}

View file

@ -0,0 +1,221 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Feature tests for the `fedi-discover:validate` artisan command.
 *
 * Each test seeds Instance rows (valid or deliberately broken) and checks
 * the command's exit code and, where relevant, fragments of its output.
 */
class ValidateInstancesCommandTest extends TestCase
{
    use RefreshDatabase;

    public function test_it_exits_zero_when_the_database_is_empty(): void
    {
        $this->artisan('fedi-discover:validate')
            ->assertExitCode(0);
    }

    public function test_it_exits_zero_when_all_instances_are_valid(): void
    {
        $this->createInstance();

        $this->artisan('fedi-discover:validate')
            ->assertExitCode(0);
    }

    public function test_it_exits_nonzero_when_a_row_has_an_invalid_url(): void
    {
        $this->createInstance(['url' => 'not-a-url']);

        $this->artisan('fedi-discover:validate')
            ->assertExitCode(1);
    }

    public function test_it_exits_nonzero_when_a_row_has_a_zero_interval(): void
    {
        $this->createInstance(['interval_seconds' => 0]);

        $this->artisan('fedi-discover:validate')
            ->assertExitCode(1);
    }

    public function test_it_reports_summary_of_valid_and_invalid_counts(): void
    {
        $this->createInstance();
        $this->createInstance(['url' => 'https://hachyderm.io']);
        $this->createInstance(['url' => 'bogus']);

        // NOTE(review): the bare '3' is a weak check for the total; it would
        // match any digit 3 in the output (e.g. a row id). Tighten once the
        // exact summary wording is confirmed.
        $this->artisan('fedi-discover:validate')
            ->expectsOutputToContain('3')
            ->expectsOutputToContain('2 valid')
            ->expectsOutputToContain('1 invalid')
            ->assertExitCode(1);
    }

    public function test_it_does_not_fail_fast_and_reports_every_invalid_row(): void
    {
        $this->createInstance(['url' => 'bogus-one']);
        $second = $this->createInstance(['interval_seconds' => 0]);

        $this->artisan('fedi-discover:validate')
            ->expectsOutputToContain('bogus-one')
            ->expectsOutputToContain((string) $second->id)
            ->assertExitCode(1);
    }

    public function test_it_includes_the_validation_error_message_for_each_invalid_row(): void
    {
        $this->createInstance(['url' => 'not-a-url']);

        $this->artisan('fedi-discover:validate')
            ->expectsOutputToContain('Invalid URL: not-a-url')
            ->assertExitCode(1);
    }

    public function test_summary_counts_are_accurate_when_mixed(): void
    {
        // 2 valid
        $this->createInstance();
        $this->createInstance(['url' => 'https://hachyderm.io']);

        // 3 invalid (different defects)
        $this->createInstance(['url' => 'bogus-one']);
        $this->createInstance(['url' => 'https://fosstodon.org', 'interval_seconds' => 0]);
        $this->createInstance(['url' => 'also-bad', 'interval_seconds' => -5]);

        // NOTE(review): the bare '5' is a weak check for the total; it could
        // also be satisfied by a row id or by the -5 interval if that is
        // echoed. Tighten once the exact summary wording is confirmed.
        $this->artisan('fedi-discover:validate')
            ->expectsOutputToContain('5')
            ->expectsOutputToContain('2 valid')
            ->expectsOutputToContain('3 invalid')
            ->assertExitCode(1);
    }

    public function test_it_exits_zero_with_enabled_only_when_no_enabled_instances_exist(): void
    {
        $this->createInstance(['enabled' => false]);

        $this->artisan('fedi-discover:validate', ['--enabled-only' => true])
            ->assertExitCode(0);
    }

    public function test_it_exits_zero_with_an_enabled_only_flag_when_disabled_rows_are_invalid(): void
    {
        // A disabled row that would fail InstanceConfig validation
        $this->createInstance([
            'url' => 'broken-and-disabled',
            'enabled' => false,
            'interval_seconds' => 0,
        ]);

        // A valid enabled row
        $this->createInstance();

        $this->artisan('fedi-discover:validate', ['--enabled-only' => true])
            ->assertExitCode(0);
    }

    /**
     * Persists an Instance row. The defaults describe a valid, enabled
     * Mastodon instance; $overrides replace individual columns so a test
     * only spells out what makes its row special.
     *
     * @param  array<string, mixed>  $overrides
     */
    private function createInstance(array $overrides = []): Instance
    {
        return Instance::create(array_merge([
            'type' => InstanceType::Mastodon,
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => 600,
            'extras' => [],
        ], $overrides));
    }
}

View file

@ -0,0 +1,121 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Unit;
use Lvl0\FediDiscover\Config\InstanceConfig;
use Lvl0\FediDiscover\Config\InstanceType;
use PHPUnit\Framework\Attributes\DataProvider;
use PHPUnit\Framework\TestCase;
/**
 * Unit tests for InstanceConfig::fromArray() / toArray(): field mapping,
 * defaults, and rejection of missing or invalid input.
 */
class InstanceConfigTest extends TestCase
{
    public function test_from_array_returns_instance_config_with_correct_field_values(): void
    {
        $config = InstanceConfig::fromArray(
            $this->validInput(['extras' => ['token' => 'abc123']])
        );

        $this->assertSame(InstanceType::Mastodon, $config->type);
        $this->assertSame('https://mastodon.social', $config->url);
        $this->assertTrue($config->enabled);
        $this->assertSame(600, $config->intervalSeconds);
        $this->assertSame(['token' => 'abc123'], $config->extras);
    }

    /**
     * Covers zero as well as negative values, as the test name promises.
     */
    #[DataProvider('nonPositiveIntervalProvider')]
    public function test_from_array_rejects_non_positive_interval_seconds(int $intervalSeconds): void
    {
        $this->expectException(\InvalidArgumentException::class);

        InstanceConfig::fromArray($this->validInput([
            'interval_seconds' => $intervalSeconds,
            'extras' => [],
        ]));
    }

    /**
     * @return array<string, array{int}>
     */
    public static function nonPositiveIntervalProvider(): array
    {
        return [
            'zero' => [0],
            'negative one' => [-1],
            'large negative' => [-600],
        ];
    }

    public function test_extras_defaults_to_empty_array_when_omitted(): void
    {
        $config = InstanceConfig::fromArray($this->validInput());

        $this->assertSame([], $config->extras);
    }

    #[DataProvider('requiredKeyProvider')]
    public function test_from_array_throws_when_required_key_is_missing(string $missingKey): void
    {
        $input = $this->validInput();
        unset($input[$missingKey]);

        $this->expectException(\InvalidArgumentException::class);
        // The exception message must name the key that is missing.
        $this->expectExceptionMessageMatches('/' . preg_quote($missingKey, '/') . '/');

        InstanceConfig::fromArray($input);
    }

    /**
     * @return array<string, array{string}>
     */
    public static function requiredKeyProvider(): array
    {
        return [
            'type missing' => ['type'],
            'url missing' => ['url'],
            'enabled missing' => ['enabled'],
            'interval_seconds missing' => ['interval_seconds'],
        ];
    }

    public function test_from_array_throws_invalid_argument_exception_for_unknown_type_string(): void
    {
        $this->expectException(\InvalidArgumentException::class);
        $this->expectExceptionMessageMatches('/pleroma/');

        InstanceConfig::fromArray($this->validInput([
            'type' => 'pleroma',
            'url' => 'https://pleroma.example.com',
        ]));
    }

    public function test_from_array_rejects_malformed_url(): void
    {
        $this->expectException(\InvalidArgumentException::class);

        InstanceConfig::fromArray($this->validInput(['url' => 'not a url']));
    }

    public function test_to_array_produces_array_that_round_trips_through_from_array(): void
    {
        // Spelled out in full: assertSame() on arrays is key-order sensitive.
        $original = [
            'type' => 'mastodon',
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => 600,
            'extras' => ['token' => 'abc123'],
        ];

        $this->assertSame($original, InstanceConfig::fromArray($original)->toArray());
    }

    /**
     * Returns the four required keys with valid values, with $overrides
     * merged on top. `extras` is deliberately absent unless overridden.
     *
     * @param  array<string, mixed>  $overrides
     * @return array<string, mixed>
     */
    private function validInput(array $overrides = []): array
    {
        return array_merge([
            'type' => 'mastodon',
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => 600,
        ], $overrides);
    }
}

View file

@ -0,0 +1,31 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Unit;
use Carbon\CarbonImmutable;
use Lvl0\FediDiscover\Events\PollFailed;
use Lvl0\FediDiscover\Models\Instance;
use PHPUnit\Framework\TestCase;
/**
 * Unit test for the PollFailed event's constructor payload.
 */
class PollFailedTest extends TestCase
{
    public function test_it_exposes_all_payload_fields(): void
    {
        $when = CarbonImmutable::parse('2026-04-28T09:00:00');
        $model = new Instance;
        $model->id = 7;

        $event = new PollFailed(instance: $model, message: 'Connection timed out', failedAt: $when);

        // The event must hand back the very same model object it was given.
        $this->assertSame($model, $event->instance);
        $this->assertSame('Connection timed out', $event->message);
        $this->assertTrue($when->eq($event->failedAt));
    }
}

View file

@ -0,0 +1,44 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Unit;
use Carbon\CarbonImmutable;
use Lvl0\FediDiscover\Events\UrlDiscovered;
use PHPUnit\Framework\TestCase;
/**
 * Unit tests for the UrlDiscovered event's constructor payload.
 */
class UrlDiscoveredTest extends TestCase
{
    public function test_it_exposes_all_payload_fields(): void
    {
        $when = CarbonImmutable::parse('2026-04-26T12:00:00');

        $event = new UrlDiscovered(
            url: 'https://example.com/article',
            instanceId: 42,
            discoveredAt: $when,
            postUrl: 'https://mastodon.social/@alice/109876543210',
            postBody: 'Check out this article: https://example.com/article'
        );

        $this->assertSame('https://example.com/article', $event->url);
        $this->assertSame(42, $event->instanceId);
        $this->assertTrue($when->eq($event->discoveredAt));
        $this->assertSame('https://mastodon.social/@alice/109876543210', $event->postUrl);
        $this->assertSame('Check out this article: https://example.com/article', $event->postBody);
    }

    public function test_post_body_is_nullable(): void
    {
        $when = CarbonImmutable::parse('2026-04-26T12:00:00');

        $event = new UrlDiscovered(
            url: 'https://example.com/article',
            instanceId: 1,
            discoveredAt: $when,
            postUrl: 'https://mastodon.social/@alice/109876543210',
            postBody: null
        );

        $this->assertNull($event->postBody);
    }
}

View file

@ -3,6 +3,11 @@
xsi:noNamespaceSchemaLocation="vendor/phpunit/phpunit/phpunit.xsd" xsi:noNamespaceSchemaLocation="vendor/phpunit/phpunit/phpunit.xsd"
bootstrap="vendor/autoload.php" bootstrap="vendor/autoload.php"
colors="true" colors="true"
processIsolation="false"
displayDetailsOnPhpunitDeprecations="true"
displayDetailsOnTestsThatTriggerErrors="true"
displayDetailsOnTestsThatTriggerWarnings="true"
displayDetailsOnTestsThatTriggerNotices="true"
> >
<testsuites> <testsuites>
<testsuite name="Unit"> <testsuite name="Unit">
@ -22,19 +27,21 @@
</include> </include>
</source> </source>
<php> <php>
<env name="APP_ENV" value="testing"/> <server name="APP_ENV" value="testing"/>
<env name="APP_MAINTENANCE_DRIVER" value="file"/> <server name="APP_MAINTENANCE_DRIVER" value="file"/>
<env name="BCRYPT_ROUNDS" value="4"/> <server name="BCRYPT_ROUNDS" value="4"/>
<env name="BROADCAST_CONNECTION" value="null"/> <server name="BROADCAST_CONNECTION" value="null"/>
<env name="CACHE_STORE" value="array"/> <server name="CACHE_STORE" value="array"/>
<env name="DB_CONNECTION" value="sqlite"/> <server name="DB_CONNECTION" value="sqlite"/>
<env name="DB_DATABASE" value=":memory:"/> <server name="DB_DATABASE" value=":memory:"/>
<env name="DB_URL" value=""/> <server name="DB_URL" value=""/>
<env name="MAIL_MAILER" value="array"/> <server name="MAIL_MAILER" value="array"/>
<env name="QUEUE_CONNECTION" value="sync"/> <server name="QUEUE_CONNECTION" value="sync"/>
<env name="SESSION_DRIVER" value="array"/> <server name="SESSION_DRIVER" value="array"/>
<env name="PULSE_ENABLED" value="false"/> <server name="PULSE_ENABLED" value="false"/>
<env name="TELESCOPE_ENABLED" value="false"/> <server name="TELESCOPE_ENABLED" value="false"/>
<env name="NIGHTWATCH_ENABLED" value="false"/> <server name="NIGHTWATCH_ENABLED" value="false"/>
<ini name="display_errors" value="On"/>
<ini name="error_reporting" value="-1"/>
</php> </php>
</phpunit> </phpunit>

8
pint.json Normal file
View file

@ -0,0 +1,8 @@
{
"preset": "laravel",
"rules": {
"concat_space": {
"spacing": "one"
}
}
}

View file

@ -0,0 +1,29 @@
@extends('layouts.app')
@section('content')
<div>
<h1>Instances</h1>
<table>
<thead>
<tr>
<th>Instance</th>
<th>Last polled at</th>
<th>URLs</th>
<th>Errors</th>
</tr>
</thead>
<tbody>
@foreach($instances as $instance)
<tr>
<td>{{ $instance->url }}</td>
<td>{{ $instance->last_polled_at }}</td>
<td>{{ $instance->pages_count }} URLs</td>
<td>{{ $instance->failed_pages_count }} errors</td>
</tr>
@endforeach
</tbody>
</table>
</div>
@endsection

View file

@ -0,0 +1,63 @@
@extends('layouts.app')
@section('content')
<main>
<h1>About TroveBot</h1>
<p>
<strong>Trove</strong> is a federated search engine for the small web,
seeded by fediverse attention and ranked by domain coherence rather than
commercial authority. <strong>TroveBot</strong> is its crawler &mdash; it
discovers and indexes URLs shared by people on the fediverse, then
follows the citations they make to find more of the small web.
</p>
<h2>Identity</h2>
<p>TroveBot identifies itself with the following User-Agent string:</p>
<pre><code>TroveBot/0.1 (+https://trove.lvl0.xyz/bot)</code></pre>
<h2>Crawling behavior</h2>
<ul>
<li>Respects <code>robots.txt</code> rules under <code>User-agent: TroveBot</code> (and the wildcard <code>User-agent: *</code> as a fallback).</li>
<li>Polite per-domain rate limit &mdash; at most a few requests per minute per host.</li>
<li>Follows up to 5 redirects per URL.</li>
<li>Fetches HTML only. PDFs, images, and other binary content are recorded as discovered but never re-fetched.</li>
<li>Does not execute JavaScript, does not crawl behind authentication, does not crawl URLs containing user credentials.</li>
</ul>
<h2>Opt out</h2>
<p>
Block TroveBot entirely by adding the following to your site's
<code>robots.txt</code>:
</p>
<pre><code>User-agent: TroveBot
Disallow: /</code></pre>
<p>
Or block specific paths:
</p>
<pre><code>User-agent: TroveBot
Disallow: /private/
Disallow: /admin/</code></pre>
<h2>Contact &amp; source</h2>
<ul>
<li>
Issues, questions, abuse reports:
<a href="https://forge.lvl0.xyz/lvl0/trove/issues">forge.lvl0.xyz/lvl0/trove/issues</a>
</li>
<li>
Source code:
<a href="https://forge.lvl0.xyz/lvl0/trove">forge.lvl0.xyz/lvl0/trove</a>
</li>
</ul>
</main>
@endsection

View file

@ -0,0 +1,18 @@
<!DOCTYPE html>
<html lang="{{ str_replace('_', '-', app()->getLocale()) }}">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Trove @yield('title', config('app.name'))</title>
@vite(['resources/css/app.css', 'resources/js/app.js'])
@livewireStyles
</head>
<body>
@yield('content')
@livewireScripts
</body>
</html>

View file

@ -0,0 +1,14 @@
<div>
@error('rate_limit') <p>{{ $message }}</p> @enderror
@if ($confirmedUrl !== null)
<p>Thanks, we've received <strong>{{ $confirmedUrl }}</strong></p>
@else
<form wire:submit="submit">
<label for="url">URL</label>
<input id="url" type="url" wire:model="url" required>
@error('url') <p>{{ $message }}</p> @enderror
<button type="submit">Submit</button>
</form>
@endif
</div>

View file

@ -0,0 +1,7 @@
@extends('layouts.app')
@section('content')
<livewire:url-submission-form />
@endsection

File diff suppressed because one or more lines are too long

View file

@ -1,8 +1,8 @@
<?php <?php
use Illuminate\Foundation\Inspiring; use Illuminate\Support\Facades\Schedule;
use Illuminate\Support\Facades\Artisan;
Artisan::command('inspire', function () { Schedule::command('fedi-discover:poll')
$this->comment(Inspiring::quote()); ->everyMinute()
})->purpose('Display an inspiring quote'); ->withoutOverlapping(5)
->runInBackground();

View file

@ -1,7 +1,16 @@
<?php <?php
declare(strict_types=1);
use App\Http\Controllers\Admin\InstancesController;
use Illuminate\Support\Facades\Route; use Illuminate\Support\Facades\Route;
Route::get('/', function () { Route::get('/', function () {
return view('welcome'); return view('welcome');
}); });
Route::view('/submit', 'urls.submit');
Route::view('/bot', 'bot');
Route::get('/admin/instances', [InstancesController::class, 'index'])->name('admin.instances');

View file

@ -92,6 +92,10 @@ pkgs.mkShell {
podman-compose -f $COMPOSE_FILE exec app php artisan "$@" podman-compose -f $COMPOSE_FILE exec app php artisan "$@"
} }
dev-composer() {
podman-compose -f $COMPOSE_FILE exec app composer "$@"
}
# =================== # ===================
# BUILD COMMANDS # BUILD COMMANDS
# =================== # ===================
@ -141,6 +145,7 @@ pkgs.mkShell {
echo " dev-logs-redis Tail Redis logs" echo " dev-logs-redis Tail Redis logs"
echo " dev-shell Shell into app container" echo " dev-shell Shell into app container"
echo " dev-artisan <cmd> Run artisan command" echo " dev-artisan <cmd> Run artisan command"
echo " dev-composer <cmd> Run composer command"
echo " base-build Build and push image" echo " base-build Build and push image"
echo "" echo ""
echo "Services:" echo "Services:"

View file

@ -0,0 +1,511 @@
<?php
declare(strict_types=1);
namespace Tests\Feature\Actions;
use App\Actions\FetchPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Services\LanguageDetectionService;
use App\ValueObjects\FetchResult;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Psr7\Request;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Http;
use Tests\TestCase;
class FetchPageActionTest extends TestCase
{
/**
 * A 200 response served as text/html must yield a Success FetchResult that
 * carries the status code and a non-null final URL.
 */
public function test_successful_html_fetch_returns_success_outcome(): void
{
    $htmlResponse = Http::response(
        '<html><body>Hello</body></html>',
        200,
        ['Content-Type' => 'text/html'],
    );
    Http::fake(['example.com/*' => $htmlResponse]);

    $fetch = $this->makeAction();
    $result = $fetch('https://example.com/page');

    $this->assertInstanceOf(FetchResult::class, $result);
    $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
    $this->assertSame(200, $result->statusCode);
    $this->assertNotNull($result->finalUrl);
}
/**
 * A 404 response must map to the Blocked4xx outcome, keep the status code and
 * final URL, and mention the status code in the error message.
 */
public function test_4xx_response_returns_blocked_4xx(): void
{
    Http::fake(['example.com/*' => Http::response('Not Found', 404)]);

    $fetch = $this->makeAction();
    $result = $fetch('https://example.com/missing');

    $this->assertInstanceOf(FetchResult::class, $result);
    $this->assertSame(CrawlOutcomeEnum::Blocked4xx, $result->outcome);
    $this->assertSame(404, $result->statusCode);

    $message = $result->errorMessage;
    $this->assertIsString($message);
    $this->assertStringContainsString('404', $message);
    $this->assertNotNull($result->finalUrl);
}
/**
 * A 503 response must map to the Blocked5xx outcome, keep the status code and
 * final URL, and mention the status code in the error message.
 */
public function test_5xx_response_returns_blocked_5xx(): void
{
    Http::fake(['example.com/*' => Http::response('Service Unavailable', 503)]);

    $fetch = $this->makeAction();
    $result = $fetch('https://example.com/page');

    $this->assertInstanceOf(FetchResult::class, $result);
    $this->assertSame(CrawlOutcomeEnum::Blocked5xx, $result->outcome);
    $this->assertSame(503, $result->statusCode);

    $message = $result->errorMessage;
    $this->assertIsString($message);
    $this->assertStringContainsString('503', $message);
    $this->assertNotNull($result->finalUrl);
}
/**
 * A 200 response with a non-HTML Content-Type (application/pdf here) must be
 * Rejected: the error message names the content type and no title, text,
 * links or word count are extracted.
 */
public function test_non_html_content_type_returns_rejected(): void
{
    $pdfResponse = Http::response(
        'PDF binary stuff',
        200,
        ['Content-Type' => 'application/pdf'],
    );
    Http::fake(['example.com/*' => $pdfResponse]);

    $fetch = $this->makeAction();
    $result = $fetch('https://example.com/document.pdf');

    $this->assertInstanceOf(FetchResult::class, $result);
    $this->assertSame(CrawlOutcomeEnum::Rejected, $result->outcome);
    $this->assertSame(200, $result->statusCode);

    $message = $result->errorMessage;
    $this->assertIsString($message);
    $this->assertStringContainsString('application/pdf', $message);
    $this->assertNotNull($result->finalUrl);

    // Nothing may be extracted from a rejected document.
    $this->assertNull($result->title);
    $this->assertNull($result->extractedText);
    $this->assertEmpty($result->outboundLinks);
    $this->assertNull($result->wordCount);
}
/**
 * A Content-Type of "text/html; charset=utf-8" (with a charset parameter)
 * must still be treated as HTML and produce a Success outcome.
 */
public function test_text_html_with_charset_is_accepted(): void
{
    $htmlResponse = Http::response(
        '<html><body>Hello charset world</body></html>',
        200,
        ['Content-Type' => 'text/html; charset=utf-8'],
    );
    Http::fake(['example.com/*' => $htmlResponse]);

    $fetch = $this->makeAction();
    $result = $fetch('https://example.com/page');

    $this->assertInstanceOf(FetchResult::class, $result);
    $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
    $this->assertSame(200, $result->statusCode);
}
/**
 * A Guzzle ConnectException with cURL errno 6 must produce a Failed outcome
 * with no status code, no final URL and no extracted content.
 */
public function test_connection_failure_returns_failed(): void
{
    // Every faked request throws instead of returning a response.
    Http::fake(fn () => throw new ConnectException(
        'Could not resolve host',
        new Request('GET', 'https://example.com/page'),
        null,
        ['errno' => 6],
    ));

    $fetch = $this->makeAction();
    $result = $fetch('https://example.com/page');

    $this->assertInstanceOf(FetchResult::class, $result);
    $this->assertSame(CrawlOutcomeEnum::Failed, $result->outcome);
    $this->assertNull($result->statusCode);
    $this->assertNull($result->finalUrl);
    $this->assertIsString($result->errorMessage);

    // No content can be extracted when the connection never succeeded.
    $this->assertNull($result->title);
    $this->assertNull($result->extractedText);
    $this->assertEmpty($result->outboundLinks);
    $this->assertNull($result->wordCount);
}
/**
 * A Guzzle ConnectException with cURL errno 28 must produce the Timeout
 * outcome with no status code and no final URL.
 */
public function test_timeout_returns_timeout(): void
{
    // Every faked request throws instead of returning a response.
    Http::fake(fn () => throw new ConnectException(
        'cURL error 28: Operation timed out',
        new Request('GET', 'https://example.com/page'),
        null,
        ['errno' => 28],
    ));

    $fetch = $this->makeAction();
    $result = $fetch('https://example.com/page');

    $this->assertInstanceOf(FetchResult::class, $result);
    $this->assertSame(CrawlOutcomeEnum::Timeout, $result->outcome);
    $this->assertNull($result->statusCode);
    $this->assertNull($result->finalUrl);
    $this->assertIsString($result->errorMessage);
}
/**
 * On success the result's title must equal the text of the document's
 * <title> element.
 */
public function test_success_extracts_title_from_html(): void
{
    $markup = '<html><head><title>My Page Title</title></head><body><p>Some content.</p></body></html>';
    Http::fake([
        'example.com/*' => Http::response($markup, 200, ['Content-Type' => 'text/html']),
    ]);

    $fetch = $this->makeAction();
    $result = $fetch('https://example.com/page');

    $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
    $this->assertSame('My Page Title', $result->title);
}
/**
 * On success the extracted text must contain the copy of the <article> element.
 *
 * The fixture wraps the article in <nav> and <footer> text, but this test only
 * asserts that an article sentence is present; it does not assert that the
 * surrounding text is absent.
 */
public function test_success_extracts_main_text(): void
{
$html = <<<'HTML'
<!DOCTYPE html>
<html>
<head><title>Article Title</title></head>
<body>
<nav>Navigation links</nav>
<article>
<h1>The Real Article</h1>
<p>This is the main article body that should be extracted by readability.</p>
<p>Multiple paragraphs prove the extractor works on the full content.</p>
</article>
<footer>Site footer noise</footer>
</body>
</html>
HTML;
Http::fake([
'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']),
]);
$result = $this->makeAction()('https://example.com/article');
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
$this->assertNotNull($result->extractedText);
$this->assertStringContainsString('main article body', $result->extractedText);
}
/**
 * Outbound links must be collected from the article only, resolved to absolute
 * URLs, and filtered.
 *
 * The fixture holds seven anchors. Exactly two may survive: the absolute
 * external link and the relative link resolved against the page URL. The
 * private-IP, credential-bearing and ftp:// links must be absent, and the
 * exact count of 2 also rules out the <nav> and <footer> anchors.
 */
public function test_success_extracts_and_filters_outbound_links(): void
{
$html = <<<'HTML'
<!DOCTYPE html>
<html>
<head><title>Article With Links</title></head>
<body>
<nav>
<a href="/home">Home (nav, should be filtered out by Readability scope)</a>
</nav>
<article>
<h1>Article Title</h1>
<p>This article references <a href="https://other.com/article">an external article</a>.</p>
<p>And a <a href="/related-post">relative link to a related post</a> on the same site.</p>
<p>Plus a <a href="http://192.168.1.1/admin">private IP link</a> that should be rejected.</p>
<p>And a <a href="https://user:pass@evil.com/">credentials URL</a> that should be rejected.</p>
<p>And a <a href="ftp://files.example.com/">non-http scheme</a> that should be rejected.</p>
</article>
<footer>
<a href="/privacy">Privacy (footer, filtered by Readability scope)</a>
</footer>
</body>
</html>
HTML;
Http::fake([
'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']),
]);
$result = $this->makeAction()('https://example.com/article');
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
$this->assertInstanceOf(Collection::class, $result->outboundLinks);
// Exactly the two accepted links: external absolute + resolved relative.
$this->assertSame(2, $result->outboundLinks->count());
$this->assertContains('https://other.com/article', $result->outboundLinks->all());
$this->assertContains('https://example.com/related-post', $result->outboundLinks->all());
// Rejected candidates must not leak through.
$this->assertNotContains('http://192.168.1.1/admin', $result->outboundLinks->all());
$this->assertNotContains('https://user:pass@evil.com/', $result->outboundLinks->all());
$this->assertNotContains('ftp://files.example.com/', $result->outboundLinks->all());
}
/**
 * The word count of a successful fetch must equal the number of words in the
 * article text: the fixture's single paragraph has nine words and the result
 * must report exactly 9 (the three-word <title> is not counted).
 */
public function test_success_calculates_word_count(): void
{
$html = <<<'HTML'
<!DOCTYPE html>
<html>
<head><title>Word Count Test</title></head>
<body>
<article>
<p>This article body has exactly nine words total here.</p>
</article>
</body>
</html>
HTML;
Http::fake([
'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']),
]);
$result = $this->makeAction()('https://example.com/article');
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
$this->assertSame(9, $result->wordCount);
}
/**
 * Content-Type matching must be case-insensitive: "Text/HTML; charset=utf-8"
 * has to be accepted as HTML and yield a Success outcome.
 */
public function test_uppercase_content_type_is_accepted_as_html(): void
{
    $markup = '<html><head><title>Uppercase CT</title></head><body><p>Content here.</p></body></html>';
    Http::fake([
        'example.com/*' => Http::response($markup, 200, ['Content-Type' => 'Text/HTML; charset=utf-8']),
    ]);

    $fetch = $this->makeAction();
    $result = $fetch('https://example.com/page');

    $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
}
/**
 * An anchor whose href is the empty string must not be reported as an
 * outbound link: the fixture's only anchor is such a link, so the collection
 * must be empty.
 */
public function test_empty_href_is_filtered_from_outbound_links(): void
{
$html = <<<'HTML'
<!DOCTYPE html>
<html>
<head><title>Empty Href Test</title></head>
<body>
<article>
<p>This paragraph has <a href="">an empty href anchor</a> that should be dropped.</p>
</article>
</body>
</html>
HTML;
Http::fake([
'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']),
]);
$result = $this->makeAction()('https://example.com/article');
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
$this->assertSame(0, $result->outboundLinks->count());
}
/**
 * An anchor whose href is only a fragment ("#section-2") must not be reported
 * as an outbound link: the fixture's only anchor is such a link, so the
 * collection must be empty.
 */
public function test_fragment_only_href_is_filtered_from_outbound_links(): void
{
$html = <<<'HTML'
<!DOCTYPE html>
<html>
<head><title>Fragment Href Test</title></head>
<body>
<article>
<p>Jump to <a href="#section-2">section 2</a> of this page.</p>
</article>
</body>
</html>
HTML;
Http::fake([
'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']),
]);
$result = $this->makeAction()('https://example.com/article');
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
$this->assertSame(0, $result->outboundLinks->count());
}
/**
 * When the article text is long enough, LanguageDetectionService::detect()
 * must be called exactly once and its [language, confidence] pair must be
 * copied onto the result unchanged.
 */
public function test_sufficient_text_triggers_language_detection_and_result_propagates(): void
{
// 27 words in the paragraph — above the detection threshold
$body = <<<'HTML'
<!DOCTYPE html>
<html>
<head><title>Language Detection Test</title></head>
<body>
<article>
<p>The quick brown fox jumps over the lazy dog and then runs away into the forest
where many other animals live and play together every single day.</p>
</article>
</body>
</html>
HTML;
Http::fake([
'example.com/*' => Http::response($body, 200, ['Content-Type' => 'text/html']),
]);
// The mock replaces the container binding, so the action resolved below receives it.
$detection = $this->mock(LanguageDetectionService::class);
$detection->shouldReceive('detect')
->once()
->andReturn(['en', 0.95]);
$result = $this->makeAction()('https://example.com/article');
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
$this->assertSame('en', $result->language);
$this->assertSame(0.95, $result->languageConfidence);
}
/**
 * For a short article the detection service must not be called at all; the
 * <html lang="..."> value is used verbatim as the language with a confidence
 * of 1.0.
 */
public function test_short_body_with_html_lang_attr_skips_service_and_uses_lang_attr(): void
{
// 6 words in the paragraph — below the detection threshold
$html = <<<'HTML'
<!DOCTYPE html>
<html lang="pt-BR">
<head><title>Short Page</title></head>
<body>
<article>
<p>Too short to detect language automatically.</p>
</article>
</body>
</html>
HTML;
Http::fake([
'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']),
]);
// Any call to detect() fails the test.
$detection = $this->mock(LanguageDetectionService::class);
$detection->shouldReceive('detect')->never();
$result = $this->makeAction()('https://example.com/article');
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
$this->assertSame('pt-BR', $result->language);
$this->assertSame(1.0, $result->languageConfidence);
}
/**
 * For a short article without an <html lang> attribute the detection service
 * must not be called and both language and confidence must stay null.
 */
public function test_short_body_with_no_lang_attr_returns_null_language(): void
{
// 6 words in the paragraph — below the detection threshold
$html = <<<'HTML'
<!DOCTYPE html>
<html>
<head><title>Short Page</title></head>
<body>
<article>
<p>Too short to detect language automatically.</p>
</article>
</body>
</html>
HTML;
Http::fake([
'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']),
]);
// Any call to detect() fails the test.
$detection = $this->mock(LanguageDetectionService::class);
$detection->shouldReceive('detect')->never();
$result = $this->makeAction()('https://example.com/article');
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
$this->assertNull($result->language);
$this->assertNull($result->languageConfidence);
}
/**
 * A lang attribute that contains only whitespace must be handled like a
 * missing attribute: no detection call, null language and null confidence.
 */
public function test_whitespace_only_lang_attr_is_treated_as_absent(): void
{
// 6 words in the paragraph — below the detection threshold; lang attr is blank/whitespace-only
$html = <<<'HTML'
<!DOCTYPE html>
<html lang=" ">
<head><title>Short Page</title></head>
<body>
<article>
<p>Too short to detect language automatically.</p>
</article>
</body>
</html>
HTML;
Http::fake([
'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']),
]);
// Any call to detect() fails the test.
$detection = $this->mock(LanguageDetectionService::class);
$detection->shouldReceive('detect')->never();
$result = $this->makeAction()('https://example.com/article');
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
$this->assertNull($result->language);
$this->assertNull($result->languageConfidence);
}
/**
 * An over-long lang attribute (more than 35 characters) must be discarded:
 * no detection call, null language and null confidence.
 */
public function test_lang_attr_longer_than_35_chars_is_rejected(): void
{
// 6 words in the paragraph — below the detection threshold; lang attr exceeds BCP-47 column width (string(35))
$html = <<<'HTML'
<!DOCTYPE html>
<html lang="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-bbb">
<head><title>Short Page</title></head>
<body>
<article>
<p>Too short to detect language automatically.</p>
</article>
</body>
</html>
HTML;
Http::fake([
'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']),
]);
// Any call to detect() fails the test.
$detection = $this->mock(LanguageDetectionService::class);
$detection->shouldReceive('detect')->never();
$result = $this->makeAction()('https://example.com/article');
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
$this->assertNull($result->language);
$this->assertNull($result->languageConfidence);
}
/**
 * When detection runs but reports a low confidence (0.15 here), its guess
 * must be ignored in favour of the <html lang> attribute, which is reported
 * with a confidence of 1.0.
 */
public function test_low_confidence_detection_falls_through_to_lang_attr(): void
{
// 27 words in the paragraph — above the detection threshold; service returns low-confidence result
$html = <<<'HTML'
<!DOCTYPE html>
<html lang="en-US">
<head><title>Confidence Floor Test</title></head>
<body>
<article>
<p>The quick brown fox jumps over the lazy dog and then runs away into the forest
where many other animals live and play together every single day.</p>
</article>
</body>
</html>
HTML;
Http::fake([
'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']),
]);
// detect() must be consulted exactly once even though its answer is discarded.
$detection = $this->mock(LanguageDetectionService::class);
$detection->shouldReceive('detect')
->once()
->andReturn(['xx', 0.15]);
$result = $this->makeAction()('https://example.com/article');
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
$this->assertSame('en-US', $result->language);
$this->assertSame(1.0, $result->languageConfidence);
}
/**
 * Resolve the action under test from the service container, so that any
 * container bindings replaced by a test (e.g. mocks) are injected into it.
 */
private function makeAction(): FetchPageAction
{
    $action = app(FetchPageAction::class);

    return $action;
}
}

View file

@ -0,0 +1,133 @@
<?php
declare(strict_types=1);
namespace Tests\Feature\Admin;
use App\Enums\PageStatusEnum;
use App\Models\Page;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
class InstancesAdminPageTest extends TestCase
{
use RefreshDatabase;
/**
 * GET /admin/instances must answer with HTTP 200.
 *
 * Note: the request is issued without authenticating any user.
 */
public function test_admin_instances_page_is_accessible(): void
{
    $this->get('/admin/instances')->assertStatus(200);
}
/**
 * The listing must show, for every instance, its URL and the date part of
 * its last_polled_at timestamp.
 */
public function test_admin_instances_page_shows_each_instance_url_and_last_polled_at(): void
{
    $mastodon = Instance::factory()->type(InstanceType::Mastodon)->enabled()->create([
        'url' => 'https://mastodon.social',
        'last_polled_at' => '2024-06-01 12:00:00',
    ]);
    $lemmy = Instance::factory()->type(InstanceType::Lemmy)->enabled()->create([
        'url' => 'https://lemmy.world',
        'last_polled_at' => '2024-06-01 13:00:00',
    ]);
    $instances = [$mastodon, $lemmy];

    $listing = $this->get('/admin/instances');

    // URLs first, then poll dates.
    foreach ($instances as $instance) {
        $listing->assertSee($instance->url);
    }
    foreach ($instances as $instance) {
        $listing->assertSee($instance->last_polled_at->toDateString());
    }
}
/**
 * Each instance row must show its own number of Failed pages as "{n} errors".
 *
 * Fixture: the first instance owns 3 failed + 2 fetched pages, the second
 * owns 1 failed + 4 fetched pages.
 */
public function test_admin_instances_page_shows_error_count_per_instance(): void
{
    $first = Instance::factory()->type(InstanceType::Mastodon)->enabled()
        ->create(['url' => 'https://aardvark.example']);
    $second = Instance::factory()->type(InstanceType::Lemmy)->enabled()
        ->create(['url' => 'https://zebra.example']);

    // [owning instance id, URL prefix, number of pages, page status]
    $batches = [
        [$first->id, 'https://aardvark.example/fail-', 3, PageStatusEnum::Failed],
        [$first->id, 'https://aardvark.example/ok-', 2, PageStatusEnum::Fetched],
        [$second->id, 'https://zebra.example/fail-', 1, PageStatusEnum::Failed],
        [$second->id, 'https://zebra.example/ok-', 4, PageStatusEnum::Fetched],
    ];
    foreach ($batches as [$instanceId, $urlPrefix, $howMany, $status]) {
        Page::factory()
            ->count($howMany)
            ->sequence(fn ($s) => ['url' => $urlPrefix.$s->index])
            ->createQuietly(['instance_id' => $instanceId, 'status' => $status]);
    }

    $listing = $this->get('/admin/instances');

    // "{n} errors" cannot be confused with dates, IDs or the "URLs" column,
    // and the two counts differ (3 vs 1), so matching them in row order
    // proves a per-instance count rather than a shared total.
    $listing->assertSeeInOrder([
        $first->url,
        '3 errors',
        $second->url,
        '1 errors',
    ]);
}
/**
 * Each instance row must show its own number of pages as "{n} URLs".
 *
 * Fixture: the first instance owns 7 pages, the second owns 2.
 */
public function test_admin_instances_page_shows_url_count_per_instance(): void
{
    $first = Instance::factory()->type(InstanceType::Mastodon)->enabled()
        ->create(['url' => 'https://aardvark.example']);
    $second = Instance::factory()->type(InstanceType::Lemmy)->enabled()
        ->create(['url' => 'https://zebra.example']);

    // [owning instance id, URL prefix, number of pages]
    $batches = [
        [$first->id, 'https://aardvark.example/page-', 7],
        [$second->id, 'https://zebra.example/page-', 2],
    ];
    foreach ($batches as [$instanceId, $urlPrefix, $howMany]) {
        Page::factory()
            ->count($howMany)
            ->sequence(fn ($s) => ['url' => $urlPrefix.$s->index])
            ->createQuietly(['instance_id' => $instanceId]);
    }

    $listing = $this->get('/admin/instances');

    // "{n} URLs" cannot be confused with dates, IDs or other incidental
    // numbers, so this only passes once a real count column is rendered.
    $listing->assertSeeInOrder([
        $first->url,
        '7 URLs',
        $second->url,
        '2 URLs',
    ]);
}
}

View file

@ -0,0 +1,39 @@
<?php
declare(strict_types=1);
namespace Tests\Feature;
use Tests\TestCase;
class BotPageTest extends TestCase
{
/** GET /bot must answer with HTTP 200. */
public function test_bot_page_renders_at_public_route(): void
{
    $this->get('/bot')->assertStatus(200);
}
/** The bot page must print the crawler's full User-Agent string verbatim (unescaped match). */
public function test_bot_page_contains_user_agent_string(): void
{
    $userAgent = 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)';

    $this->get('/bot')->assertSee($userAgent, escape: false);
}
/** The bot page must show both lines of a robots.txt opt-out example for TroveBot. */
public function test_bot_page_contains_robots_txt_opt_out_example(): void
{
    $page = $this->get('/bot');

    foreach (['User-agent: TroveBot', 'Disallow: /'] as $robotsLine) {
        $page->assertSee($robotsLine, escape: false);
    }
}
/** The bot page must contain the URL of the project's forge repository. */
public function test_bot_page_links_to_forge_repository(): void
{
    $repositoryUrl = 'https://forge.lvl0.xyz/lvl0/trove';

    $this->get('/bot')->assertSee($repositoryUrl, escape: false);
}
}

View file

@ -1,19 +0,0 @@
<?php
namespace Tests\Feature;
// use Illuminate\Foundation\Testing\RefreshDatabase;
use Tests\TestCase;
class ExampleTest extends TestCase
{
    /**
     * Smoke test: the application's root URL answers with HTTP 200.
     */
    public function test_the_application_returns_a_successful_response(): void
    {
        $this->get('/')->assertStatus(200);
    }
}

View file

@ -0,0 +1,573 @@
<?php
declare(strict_types=1);
namespace Tests\Feature\Jobs;
use App\Actions\FetchPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use App\Jobs\ProcessCrawlJob;
use App\Models\Page;
use App\Models\PageCrawl;
use App\ValueObjects\FetchResult;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Facades\Queue;
use Mockery;
use Tests\TestCase;
class ProcessCrawlJobTest extends TestCase
{
use RefreshDatabase;
/**
 * Creating a PageCrawl with model events enabled must push a ProcessCrawlJob
 * onto the (faked) queue.
 */
public function test_creating_a_page_crawl_dispatches_process_crawl_job(): void
{
    Queue::fake();

    // The page is created quietly; only the crawl is created with events enabled.
    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    PageCrawl::factory()
        ->page($page)
        ->create();

    Queue::assertPushed(ProcessCrawlJob::class);
}
/**
 * The ProcessCrawlJob pushed on creation must reference the PageCrawl that
 * was just created.
 */
public function test_dispatched_job_carries_the_correct_page_crawl(): void
{
    Queue::fake();

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $crawl = PageCrawl::factory()->page($page)->create();

    $carriesCrawl = function (ProcessCrawlJob $job) use ($crawl): bool {
        return $job->pageCrawl->id === $crawl->id;
    };
    Queue::assertPushed(ProcessCrawlJob::class, $carriesCrawl);
}
/**
 * After a successful fetch the PageCrawl row must record the Success outcome,
 * a completion timestamp, the status code and no error message.
 */
public function test_handle_writes_outcome_to_page_crawl_on_success(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200, title: 'Hello', extractedText: 'hi', wordCount: 1);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle();

    $reloaded = $crawl->fresh();
    $this->assertSame(CrawlOutcomeEnum::Success, $reloaded->outcome);
    $this->assertNotNull($reloaded->completed_at);
    $this->assertInstanceOf(Carbon::class, $reloaded->completed_at);
    $this->assertSame(200, $reloaded->status_code);
    $this->assertNull($reloaded->error_message);
}
/**
 * After a successful fetch the Page must be marked Fetched, get a fetched_at
 * timestamp and take over the fetched title.
 */
public function test_handle_updates_page_to_fetched_on_success(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200, title: 'Hello', extractedText: 'hi', wordCount: 1);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle();

    $reloaded = $page->fresh();
    $this->assertSame(PageStatusEnum::Fetched, $reloaded->status);
    $this->assertNotNull($reloaded->fetched_at);
    $this->assertInstanceOf(Carbon::class, $reloaded->fetched_at);
    $this->assertSame('Hello', $reloaded->title);
}
/**
 * A Rejected fetch outcome must mark the Page as Rejected and leave
 * fetched_at unset.
 */
public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Rejected, statusCode: 200, errorMessage: 'Unsupported Content-Type: application/pdf');

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/brochure.pdf']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle();

    $reloaded = $page->fresh();
    $this->assertSame(PageStatusEnum::Rejected, $reloaded->status);
    $this->assertNull($reloaded->fetched_at);
}
/**
 * A Blocked4xx fetch outcome must mark the Page as Failed and stamp failed_at.
 */
public function test_handle_updates_page_to_failed_on_blocked_4xx(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Blocked4xx, statusCode: 404, errorMessage: 'HTTP 404');

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/gone']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle();

    $reloaded = $page->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloaded->status);
    $this->assertNotNull($reloaded->failed_at);
    $this->assertInstanceOf(Carbon::class, $reloaded->failed_at);
}
/**
 * A Timeout fetch outcome must mark the Page as Failed and stamp failed_at.
 */
public function test_handle_updates_page_to_failed_on_timeout(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Timeout, errorMessage: 'Connection timed out after 10 seconds');

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/slow']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle();

    $reloaded = $page->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloaded->status);
    $this->assertNotNull($reloaded->failed_at);
    $this->assertInstanceOf(Carbon::class, $reloaded->failed_at);
}
/**
 * A Failed fetch on the first attempt must create a second, still-pending
 * PageCrawl for the same page and push a ProcessCrawlJob for that new row.
 *
 * Note: only the push is asserted; the delay of the retry job is not checked.
 */
public function test_handle_schedules_retry_on_transient_failure(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle();

    // The original crawl plus exactly one retry row.
    $crawlCount = PageCrawl::where('page_id', $page->id)->count();
    $this->assertSame(2, $crawlCount);

    // The retry row is the one that has no outcome yet.
    $retryRow = PageCrawl::where('page_id', $page->id)
        ->whereNull('outcome')
        ->first();
    $this->assertNotNull($retryRow);

    // A job must have been pushed for precisely that retry row.
    $targetsRetryRow = function (ProcessCrawlJob $job) use ($page, $retryRow): bool {
        return $job->pageCrawl->page_id === $page->id
            && $job->pageCrawl->id === $retryRow->id;
    };
    Queue::assertPushed(ProcessCrawlJob::class, $targetsRetryRow);
}
/**
 * Once a page has three crawl rows, a further failure must neither create a
 * fourth row nor push another ProcessCrawlJob.
 */
public function test_handle_does_not_retry_after_three_attempts(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);

    // Two earlier crawls in the factory's failed state; the crawl processed
    // below is the third attempt, which is the cap.
    foreach (range(1, 2) as $_) {
        PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly();
    }
    $thirdCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl]);
    $job->handle();

    // Still three rows: no retry row was inserted.
    $crawlCount = PageCrawl::where('page_id', $page->id)->count();
    $this->assertSame(3, $crawlCount);

    // And no retry job was queued.
    Queue::assertNotPushed(ProcessCrawlJob::class);
}
/**
 * A Failed fetch must be persisted on the processed PageCrawl row: Failed
 * outcome, null status code and the fetcher's error message.
 */
public function test_handle_writes_failed_outcome_to_page_crawl(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'boom');

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle();

    $expectedRow = [
        'id' => $crawl->id,
        'outcome' => CrawlOutcomeEnum::Failed->value,
        'status_code' => null,
        'error_message' => 'boom',
    ];
    $this->assertDatabaseHas('page_crawls', $expectedRow);
}
/** A Failed fetch outcome must mark the Page as Failed. */
public function test_handle_updates_page_to_failed_on_failed_outcome(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle();

    $status = $page->fresh()->status;
    $this->assertSame(PageStatusEnum::Failed, $status);
}
/** A Blocked5xx fetch outcome must mark the Page as Failed. */
public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Blocked5xx, statusCode: 503, errorMessage: 'HTTP 503');

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle();

    $status = $page->fresh()->status;
    $this->assertSame(PageStatusEnum::Failed, $status);
}
/** A BlockedRobots fetch outcome must mark the Page as Failed. */
public function test_handle_updates_page_to_failed_on_blocked_robots(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::BlockedRobots, errorMessage: 'Disallowed by robots.txt');

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle();

    $status = $page->fresh()->status;
    $this->assertSame(PageStatusEnum::Failed, $status);
}
/**
 * Outbound links attached to a Failed fetch result must be ignored: no Page
 * row is created for them and the source page stays the only Page.
 */
public function test_handle_does_not_register_outbound_links_on_failure(): void
{
    Queue::fake();
    $ignoredLink = 'https://should-not-be-registered.com/page';
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Failed,
        outboundLinks: collect([$ignoredLink]),
        errorMessage: 'Connection refused',
    );

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/broken']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle();

    $this->assertDatabaseMissing('pages', ['url' => $ignoredLink]);
    $this->assertSame(1, Page::count());
}
/**
 * Outbound links of a successful fetch must each be stored as a Page, giving
 * three pages in total (the source plus the two links).
 */
public function test_handle_registers_outbound_links_on_success(): void
{
    Queue::fake();
    $links = ['https://other.com/article-1', 'https://another.com/post-2'];
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://source.com/article',
        title: 'Source Article',
        extractedText: 'some text',
        outboundLinks: collect($links),
        wordCount: 2,
    );

    $page = Page::factory()->createQuietly(['url' => 'https://source.com/article']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle();

    foreach ($links as $link) {
        $this->assertDatabaseHas('pages', ['url' => $link]);
    }
    $this->assertSame(3, Page::count());
}
/**
 * When the domain lock is already held, handle() must return without
 * fetching: the crawl keeps a null outcome and the page keeps its status.
 *
 * Note: the job's release back onto the queue is not asserted here, only the
 * absence of side effects.
 */
public function test_handle_releases_job_when_domain_is_locked(): void
{
    Queue::fake();

    // Take the domain lock up front so the job finds it occupied.
    Cache::lock('crawler:domain:example.com', 10)->get();

    // Invoking the fetcher at all fails the test.
    $fetcher = Mockery::mock(FetchPageAction::class);
    $fetcher->shouldNotReceive('__invoke');
    $this->app->instance(FetchPageAction::class, $fetcher);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    (new ProcessCrawlJob($crawl))->handle();

    // Early return: nothing was written to the crawl row ...
    $this->assertNull($crawl->fresh()->outcome);

    // ... and the page still has the factory default status (Discovered).
    $this->assertSame(PageStatusEnum::Discovered, $page->fresh()->status);
}
/**
 * The domain lock taken inside handle() must remain held after handle()
 * returns, so an immediate second acquisition attempt has to fail.
 */
public function test_handle_does_not_release_lock_after_completion(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();

    (new ProcessCrawlJob($crawl))->handle();

    // get() would return true here if handle() had released its lock;
    // false proves the lock is still occupied.
    $acquired = Cache::lock('crawler:domain:example.com', 10)->get();
    $this->assertFalse($acquired, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.');
}
/**
 * When robots.txt disallows everything, the job must record BlockedRobots,
 * mark the page Failed, never invoke the fetcher and leave the domain lock
 * untouched.
 */
public function test_handle_writes_blocked_robots_when_disallowed(): void
{
    Queue::fake();
    $robotsTxt = Http::response("User-agent: *\nDisallow: /", 200);
    Http::fake(['https://example.com/robots.txt' => $robotsTxt]);

    // Invoking the fetcher at all fails the test: the robots check is
    // expected to stop the job before the fetch.
    $fetcher = Mockery::mock(FetchPageAction::class);
    $fetcher->shouldNotReceive('__invoke');
    $this->app->instance(FetchPageAction::class, $fetcher);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();
    $domain = $crawl->domain;

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle();

    // The crawl row records the robots block.
    $this->assertDatabaseHas('page_crawls', [
        'id' => $crawl->id,
        'outcome' => CrawlOutcomeEnum::BlockedRobots->value,
    ]);

    // The page ends up Failed.
    $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);

    // The politeness lock was never claimed, so it can be acquired now.
    $acquired = Cache::lock("crawler:domain:{$domain}", 10)->get();
    $this->assertTrue(
        $acquired,
        'Expected the politeness lock to be free (gate returned before acquiring it), but it was already held.',
    );
}
/**
 * A successful run must leave the crawl's domain lock held and still write
 * the Success outcome (i.e. taking the lock did not prevent the fetch).
 */
public function test_handle_acquires_domain_lock_before_fetching(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);

    $page = Page::factory()->createQuietly(['url' => 'https://lock-test.example.com/article']);
    $crawl = PageCrawl::factory()->page($page)->createQuietly();
    $domain = $crawl->domain;

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]);
    $job->handle();

    // A second acquisition attempt must fail because the job still owns the lock.
    $acquired = Cache::lock("crawler:domain:{$domain}", 10)->get();
    $this->assertFalse(
        $acquired,
        'Expected the domain lock to still be held after handle() ran, but it was free.',
    );

    // The outcome was written, so the fetch itself did run.
    $this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome);
}
/**
 * With a permissive robots.txt the job must pass the robots gate, call the
 * fetcher exactly once, persist Success / Fetched, and keep the politeness
 * lock held afterwards.
 */
public function test_handle_proceeds_through_politeness_lock_when_robots_allow(): void
{
    Queue::fake();

    // robots.txt served for the target host: everything is allowed.
    $robotsTxt = "User-agent: *\nAllow: /";
    Http::fake([
        'https://example.com/robots.txt' => Http::response($robotsTxt, 200),
    ]);

    // The fetch result the stubbed action hands back to the job.
    $successfulFetch = new FetchResult(
        outcome: CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://example.com/article',
        title: 'Hello',
        extractedText: 'hi',
        outboundLinks: collect(),
        wordCount: 1,
        errorMessage: null,
    );

    // Exactly one invocation is expected: the gate passed, the fetch ran once.
    $singleUseFetcher = Mockery::mock(FetchPageAction::class);
    $singleUseFetcher->shouldReceive('__invoke')->once()->andReturn($successfulFetch);
    $this->app->instance(FetchPageAction::class, $singleUseFetcher);

    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($article)->createQuietly();
    $lockName = "crawler:domain:{$pageCrawl->domain}";

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    // Success was written — not BlockedRobots.
    $this->assertDatabaseHas('page_crawls', [
        'id' => $pageCrawl->id,
        'outcome' => CrawlOutcomeEnum::Success->value,
    ]);

    // The page moved on to Fetched.
    $this->assertSame(PageStatusEnum::Fetched, $article->fresh()->status);

    // The lock claimed for the fetch is still held, so re-acquiring fails.
    $secondAcquire = Cache::lock($lockName, 10)->get();
    $this->assertFalse(
        $secondAcquire,
        'Expected the politeness lock to be held after a successful fetch, but it was free.',
    );
}
/**
 * A successful fetch that reports a language must store both the language
 * code and its confidence on the page.
 */
public function test_handle_persists_language_on_success(): void
{
    Queue::fake();

    // Stubbed fetch: success, detected as English with 0.95 confidence.
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        title: 'Hello',
        extractedText: 'hi',
        wordCount: 1,
        language: 'en',
        languageConfidence: 0.95,
    );

    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($article)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $article->fresh();

    $this->assertSame('en', $reloaded->language);
    // Compared with a tolerance because the confidence is a float.
    $this->assertEqualsWithDelta(0.95, $reloaded->language_confidence, 0.001);
}
/**
 * Language columns are sticky: a later successful fetch that detects no
 * language must keep the previously stored language and confidence, while
 * the page's other columns (status, title) are still refreshed.
 */
public function test_handle_does_not_overwrite_existing_language_when_new_fetch_returns_null(): void
{
    Queue::fake();

    // Stubbed fetch: success, but language detection came back empty.
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        title: 'Hello',
        extractedText: 'hi',
        wordCount: 1,
        language: null,
        languageConfidence: null,
    );

    // The page starts out with language data from an earlier fetch.
    $previouslyDetected = [
        'url' => 'https://example.com/article',
        'language' => 'en',
        'language_confidence' => 0.95,
    ];
    $article = Page::factory()->createQuietly($previouslyDetected);
    $pageCrawl = PageCrawl::factory()->page($article)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $article->fresh();

    // The null detection must not have wiped the stored language.
    $this->assertSame('en', $reloaded->language);
    $this->assertEqualsWithDelta(0.95, $reloaded->language_confidence, 0.001);

    // Stickiness is limited to the language columns; the rest still updates.
    $this->assertSame(PageStatusEnum::Fetched, $reloaded->status);
    $this->assertSame('Hello', $reloaded->title);
}
/**
 * With no stored language and no detection result, both language columns
 * remain null after a successful fetch.
 */
public function test_handle_leaves_language_null_when_no_prior_and_no_detection(): void
{
    Queue::fake();

    // Stubbed fetch: success, without any language information.
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        title: 'Hello',
        extractedText: 'hi',
        wordCount: 1,
        language: null,
        languageConfidence: null,
    );

    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($article)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $article->fresh();

    $this->assertNull($reloaded->language);
    $this->assertNull($reloaded->language_confidence);
}
/**
 * Replace FetchPageAction in the container with a Mockery double whose
 * __invoke returns a FetchResult assembled from the given arguments.
 *
 * Only $outcome is required. When $outboundLinks is null an empty
 * collection is used; every other argument is passed through unchanged.
 * No call-count expectation is placed on the double.
 */
private function mockFetchPageAction(
    CrawlOutcomeEnum $outcome,
    ?int $statusCode = null,
    ?string $finalUrl = 'https://example.com/article',
    ?string $title = null,
    ?string $extractedText = null,
    ?Collection $outboundLinks = null,
    ?int $wordCount = null,
    ?string $errorMessage = null,
    ?string $language = null,
    ?float $languageConfidence = null,
): void {
    // Assemble the canned result first ...
    $cannedResult = new FetchResult(
        outcome: $outcome,
        statusCode: $statusCode,
        finalUrl: $finalUrl,
        title: $title,
        extractedText: $extractedText,
        outboundLinks: $outboundLinks ?? collect(),
        wordCount: $wordCount,
        errorMessage: $errorMessage,
        language: $language,
        languageConfidence: $languageConfidence,
    );

    // ... then have the double return it on every invocation.
    $double = Mockery::mock(FetchPageAction::class);
    $double->shouldReceive('__invoke')->andReturn($cannedResult);

    $this->app->instance(FetchPageAction::class, $double);
}
}

View file

@ -0,0 +1,52 @@
<?php
declare(strict_types=1);
namespace Tests\Feature\Listeners;
use App\Listeners\PollFailedListener;
use App\Services\PollAlertService;
use Carbon\CarbonImmutable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Events\PollFailed;
use Lvl0\FediDiscover\Models\Instance;
use Mockery;
use Tests\TestCase;
/**
 * Tests for PollFailedListener: it forwards a PollFailed event to
 * PollAlertService::recordFailure() and is not a queued listener.
 */
class PollFailedListenerTest extends TestCase
{
    use RefreshDatabase;

    /**
     * handle() must call recordFailure() exactly once with the event's
     * instance and failure message.
     */
    public function test_handle_calls_record_failure_with_the_event_instance_and_message(): void
    {
        $instance = Instance::factory()
            ->type(InstanceType::Mastodon)
            ->enabled()
            ->create(['consecutive_poll_failures' => 0]);

        $reason = 'connection timed out';
        $pollFailed = new PollFailed($instance, $reason, CarbonImmutable::now());

        // The instance argument is matched by model identity, not object identity.
        $sameInstance = Mockery::on(fn (Instance $candidate) => $candidate->is($instance));

        $alerts = Mockery::mock(PollAlertService::class);
        $alerts->shouldReceive('recordFailure')
            ->once()
            ->with($sameInstance, $reason);

        (new PollFailedListener($alerts))->handle($pollFailed);
    }

    /**
     * The listener must not implement ShouldQueue.
     */
    public function test_listener_is_not_queued(): void
    {
        $listener = new PollFailedListener($this->createStub(PollAlertService::class));

        $this->assertNotInstanceOf(ShouldQueue::class, $listener);
    }
}

View file

@ -0,0 +1,70 @@
<?php
declare(strict_types=1);
namespace Tests\Feature;
use App\Models\Page;
use App\Models\PageCrawl;
use App\Services\UrlService;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Tests\TestCase;
/**
 * Verifies that page_crawls rows are inserted when (and only when) a Page
 * is created.
 */
class PageQueuePopulationTest extends TestCase
{
    use RefreshDatabase;

    /**
     * Creating a page yields a crawl row for that page, keyed by the URL's
     * host and with priority 0.
     */
    public function test_creating_a_page_inserts_a_page_crawl_row(): void
    {
        $url = 'https://example-blog.com/article';
        $expectedDomain = (new UrlService)->host($url);

        $page = Page::factory()->create(['url' => $url]);

        $this->assertDatabaseHas('page_crawls', [
            'page_id' => $page->id,
            'domain' => $expectedDomain,
            'priority' => 0,
        ]);

        $crawlRow = PageCrawl::where('page_id', $page->id)->first();
        $this->assertNotNull($crawlRow);
    }

    /**
     * firstOrCreate() on an already-known URL finds the existing row, so no
     * second crawl is inserted.
     */
    public function test_first_or_create_with_existing_url_does_not_insert_duplicate_crawl(): void
    {
        $url = 'https://example-blog.com/article';
        Page::factory()->create(['url' => $url]);

        // Existing row is returned; nothing new is created.
        Page::firstOrCreate(['url' => $url], ['status' => 'discovered']);

        $this->assertDatabaseCount('page_crawls', 1);
    }

    /**
     * Updating an existing page leaves the crawl count at one.
     */
    public function test_updating_a_page_does_not_insert_another_crawl(): void
    {
        $page = Page::factory()->create(['url' => 'https://example-blog.com/article']);

        $page->update(['title' => 'New Title']);

        $this->assertDatabaseCount('page_crawls', 1);
    }

    /**
     * A URL without a parsable host makes Page::create() throw
     * InvalidArgumentException; the page row is still persisted but no
     * crawl row is inserted.
     */
    public function test_bad_url_throws_exception_page_persists_no_crawl_inserted(): void
    {
        // Caught manually (rather than via expectException) so the database
        // assertions below still run after the throw.
        $thrown = null;

        try {
            Page::create(['url' => 'not-a-url', 'status' => 'discovered']);
        } catch (\InvalidArgumentException $exception) {
            $thrown = $exception;
        }

        $this->assertNotNull($thrown, 'Expected InvalidArgumentException to be thrown');
        $this->assertDatabaseHas('pages', ['url' => 'not-a-url']);
        $this->assertDatabaseCount('page_crawls', 0);
    }
}

View file

@ -0,0 +1,37 @@
<?php
declare(strict_types=1);
namespace Tests\Feature;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Http;
use Lvl0\FediDiscover\Actions\PollFediverseAction;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use RuntimeException;
use Tests\TestCase;
/**
 * End-to-end check: a polling failure raised inside the fedi-discover:poll
 * command ends up incrementing the instance's consecutive failure counter.
 */
class PollFailedIntegrationTest extends TestCase
{
    use RefreshDatabase;

    /**
     * When PollFediverseAction::execute() throws, running the poll command
     * must leave consecutive_poll_failures at 1 for the instance.
     */
    public function test_poll_failure_increments_consecutive_poll_failures_via_full_chain(): void
    {
        // Stub all outgoing HTTP traffic for the duration of the test.
        Http::fake();

        $instance = Instance::factory()
            ->type(InstanceType::Mastodon)
            ->enabled()
            ->create(['consecutive_poll_failures' => 0]);

        // Force the poll itself to blow up exactly once.
        $pollFailure = new RuntimeException('connection refused');
        $poller = $this->mock(PollFediverseAction::class);
        $poller->shouldReceive('execute')
            ->once()
            ->andThrow($pollFailure);

        $this->artisan('fedi-discover:poll');

        $failuresAfterPoll = $instance->fresh()->consecutive_poll_failures;
        $this->assertSame(1, $failuresAfterPoll);
    }
}

View file

@ -0,0 +1,171 @@
<?php
declare(strict_types=1);
namespace Tests\Feature\Services;
use App\Services\PollAlertService;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Http;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Feature tests for PollAlertService::recordFailure(): the failure counter
 * on the instance and the ntfy alert posted once the configured threshold
 * is reached.
 *
 * Every test fakes the HTTP client so that no real request can leave the
 * test process, regardless of how ntfy is configured in the environment.
 */
class PollAlertServiceTest extends TestCase
{
    use RefreshDatabase;

    // ---------------------------------------------------------------------
    // Helpers
    // ---------------------------------------------------------------------

    /**
     * Point the ntfy service config at the fake server used by these tests.
     *
     * @param  string|null  $topic      Topic to publish to; null leaves it unset.
     * @param  int          $threshold  Value stored in services.ntfy.threshold.
     */
    private function configureNtfy(?string $topic, int $threshold): void
    {
        config([
            'services.ntfy.url' => 'https://ntfy.example.com',
            'services.ntfy.topic' => $topic,
            'services.ntfy.threshold' => $threshold,
        ]);
    }

    /**
     * Create an enabled Mastodon instance with the given failure count.
     *
     * @param  int    $failures    Initial consecutive_poll_failures value.
     * @param  array  $attributes  Extra factory attributes (e.g. 'url').
     */
    private function instanceWithFailures(int $failures, array $attributes = []): Instance
    {
        return Instance::factory()
            ->type(InstanceType::Mastodon)
            ->enabled()
            ->create($attributes + ['consecutive_poll_failures' => $failures]);
    }

    /**
     * Assert that a POST to the configured ntfy topic URL was sent.
     */
    private function assertAlertPosted(): void
    {
        Http::assertSent(function ($request) {
            return $request->url() === 'https://ntfy.example.com/trove-alerts'
                && $request->method() === 'POST';
        });
    }

    // ---------------------------------------------------------------------
    // Tests
    // ---------------------------------------------------------------------

    /**
     * recordFailure() bumps consecutive_poll_failures by one in the database.
     */
    public function test_record_failure_increments_consecutive_poll_failures_on_the_instance(): void
    {
        // This test only inspects the counter, but recordFailure() may also
        // try to alert; fake HTTP so an ntfy-configured environment cannot
        // trigger a real request.
        Http::fake();

        $instance = $this->instanceWithFailures(0);

        (new PollAlertService)->recordFailure($instance, 'test');

        $this->assertDatabaseHas('fedi_discover_instances', [
            'id' => $instance->id,
            'consecutive_poll_failures' => 1,
        ]);
    }

    /**
     * Below the threshold (2 of 3) nothing is sent.
     */
    public function test_no_alert_sent_below_threshold(): void
    {
        Http::fake();
        $this->configureNtfy('trove-alerts', 3);

        // 1 becomes 2 after recordFailure — still below the threshold.
        $instance = $this->instanceWithFailures(1);

        (new PollAlertService)->recordFailure($instance, 'test');

        Http::assertNothingSent();
    }

    /**
     * Reaching the threshold exactly (3 of 3) posts an alert.
     */
    public function test_alert_sent_when_threshold_is_reached(): void
    {
        Http::fake();
        $this->configureNtfy('trove-alerts', 3);

        // 2 becomes 3 after recordFailure — exactly at the threshold.
        $instance = $this->instanceWithFailures(2);

        (new PollAlertService)->recordFailure($instance, 'test');

        $this->assertAlertPosted();
    }

    /**
     * Going past the threshold (4 of 3) still posts an alert.
     */
    public function test_alert_sent_when_count_exceeds_threshold(): void
    {
        Http::fake();
        $this->configureNtfy('trove-alerts', 3);

        // 3 becomes 4 after recordFailure — above the threshold.
        $instance = $this->instanceWithFailures(3);

        (new PollAlertService)->recordFailure($instance, 'test');

        $this->assertAlertPosted();
    }

    /**
     * A threshold of zero means no alert is sent, whatever the count.
     */
    public function test_no_alert_sent_when_threshold_is_zero(): void
    {
        Http::fake();
        $this->configureNtfy('trove-alerts', 0);

        $instance = $this->instanceWithFailures(5);

        (new PollAlertService)->recordFailure($instance, 'test');

        Http::assertNothingSent();
    }

    /**
     * Without a topic no alert is sent, even at the threshold.
     */
    public function test_no_alert_sent_when_topic_is_null(): void
    {
        Http::fake();
        $this->configureNtfy(null, 3);

        // 2 becomes 3 after recordFailure — at the threshold.
        $instance = $this->instanceWithFailures(2);

        (new PollAlertService)->recordFailure($instance, 'test');

        Http::assertNothingSent();
    }

    /**
     * The alert body mentions both the instance URL and the failure message.
     */
    public function test_alert_body_contains_instance_url_and_message(): void
    {
        Http::fake();
        $this->configureNtfy('trove-alerts', 3);

        // 2 becomes 3 after recordFailure — at the threshold.
        $instance = $this->instanceWithFailures(2, ['url' => 'https://mastodon.social']);

        (new PollAlertService)->recordFailure($instance, 'connection refused after 3 retries');

        Http::assertSent(function ($request) {
            return str_contains($request->body(), 'https://mastodon.social')
                && str_contains($request->body(), 'connection refused after 3 retries');
        });
    }
}

View file

@ -0,0 +1,155 @@
<?php
declare(strict_types=1);
namespace Tests\Feature;
use App\Listeners\UrlDiscoveredListener;
use App\Models\Page;
use App\Models\PageLink;
use Carbon\CarbonImmutable;
use Illuminate\Events\CallQueuedListener;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Events\UrlDiscovered;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Feature tests for the UrlDiscovered event: the pages and link it produces,
 * idempotency, the null-postUrl case, crawl rows, and queueing of the
 * listener.
 */
class UrlDiscoveryTest extends TestCase
{
    use RefreshDatabase;

    // ---------------------------------------------------------------------------
    // Fixtures
    // ---------------------------------------------------------------------------

    /**
     * An enabled Mastodon instance to attribute discoveries to.
     */
    private function makeInstance(): Instance
    {
        $factory = Instance::factory()->type(InstanceType::Mastodon)->enabled();

        return $factory->create();
    }

    /**
     * Build a UrlDiscovered event with sensible defaults.
     *
     * 'url', 'instanceId' and 'discoveredAt' fall back to their defaults when
     * absent or null. 'postUrl' and 'postBody' only fall back when the key is
     * absent, so an explicit null is passed through to the event.
     */
    private function makeEvent(Instance $instance, array $overrides = []): UrlDiscovered
    {
        $postUrl = array_key_exists('postUrl', $overrides)
            ? $overrides['postUrl']
            : 'https://mastodon.social/@alice/109876543210';

        $postBody = array_key_exists('postBody', $overrides)
            ? $overrides['postBody']
            : 'check this out https://example-blog.com/article';

        return new UrlDiscovered(
            url: $overrides['url'] ?? 'https://example-blog.com/article',
            instanceId: $overrides['instanceId'] ?? $instance->id,
            discoveredAt: $overrides['discoveredAt'] ?? CarbonImmutable::parse('2026-04-26T12:00:00Z'),
            postUrl: $postUrl,
            postBody: $postBody,
        );
    }

    // ---------------------------------------------------------------------------
    // Happy path: target page, source page, and the edge between them
    // ---------------------------------------------------------------------------

    public function test_listener_creates_target_page_and_source_page_with_link(): void
    {
        event($this->makeEvent($this->makeInstance()));

        // The linked-to article.
        $target = Page::where('url', 'https://example-blog.com/article')->first();
        $this->assertNotNull($target);

        // The post that contained the link.
        $source = Page::where('url', 'https://mastodon.social/@alice/109876543210')->first();
        $this->assertNotNull($source);

        // The source → target edge.
        $edge = PageLink::where('source_page_id', $source->id)
            ->where('target_page_id', $target->id)
            ->first();
        $this->assertNotNull($edge);
    }

    // ---------------------------------------------------------------------------
    // Idempotency: dispatching the same event twice adds nothing new
    // ---------------------------------------------------------------------------

    public function test_listener_is_idempotent_on_repeated_event(): void
    {
        $event = $this->makeEvent($this->makeInstance());

        event($event);
        event($event);

        $this->assertSame(2, Page::count());
        $this->assertSame(1, PageLink::count());
    }

    // ---------------------------------------------------------------------------
    // Null postUrl: only the target page exists, and there is no edge
    // ---------------------------------------------------------------------------

    public function test_listener_with_null_post_url_creates_only_target_page(): void
    {
        $withoutPost = ['postUrl' => null, 'postBody' => null];

        event($this->makeEvent($this->makeInstance(), $withoutPost));

        $this->assertSame(1, Page::count());
        $this->assertSame(0, PageLink::count());

        $target = Page::where('url', 'https://example-blog.com/article')->first();
        $this->assertNotNull($target);
    }

    // ---------------------------------------------------------------------------
    // Integration: both pages created by the event get a crawl row
    // ---------------------------------------------------------------------------

    public function test_url_discovered_event_enqueues_crawls_via_observer(): void
    {
        event($this->makeEvent($this->makeInstance()));

        // Two pages (target + source) → one crawl row per page, one per domain.
        $this->assertDatabaseCount('page_crawls', 2);
        $this->assertDatabaseHas('page_crawls', ['domain' => 'example-blog.com']);
        $this->assertDatabaseHas('page_crawls', ['domain' => 'mastodon.social']);
    }

    // ---------------------------------------------------------------------------
    // Queueing: the listener is pushed onto the queue rather than run inline
    // ---------------------------------------------------------------------------

    public function test_listener_is_pushed_to_queue_not_run_inline(): void
    {
        Queue::fake();

        event($this->makeEvent($this->makeInstance()));

        $isDiscoveryListener = function (CallQueuedListener $job): bool {
            return $job->class === UrlDiscoveredListener::class;
        };
        Queue::assertPushed(CallQueuedListener::class, $isDiscoveryListener);
    }
}

View file

@ -0,0 +1,158 @@
<?php
declare(strict_types=1);
namespace Tests\Feature;
use App\Enums\PageStatusEnum;
use App\Livewire\UrlSubmissionForm;
use App\Models\Page;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Livewire\Livewire;
use PHPUnit\Framework\Attributes\DataProvider;
use Tests\TestCase;
/**
 * Feature tests for the public URL submission form (UrlSubmissionForm
 * Livewire component, served at /submit): rendering, persistence,
 * idempotency, confirmation state, validation, crawl enqueueing and rate
 * limiting.
 */
class UrlSubmissionTest extends TestCase
{
    use RefreshDatabase;

    // -------------------------------------------------------------------------
    // Test 1 — route renders the submission form
    // -------------------------------------------------------------------------

    public function test_submission_form_renders_at_public_route(): void
    {
        $response = $this->get('/submit');

        $response->assertStatus(200);
        $response->assertSeeLivewire('url-submission-form');
    }

    // -------------------------------------------------------------------------
    // Test 2 — valid submission creates a page row as Discovered
    // -------------------------------------------------------------------------

    public function test_valid_url_submission_creates_page_as_discovered(): void
    {
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', 'https://example.com/interesting-post')
            ->call('submit')
            ->assertHasNoErrors();

        // Check the status as well as the URL: the test promises the row is
        // created as Discovered, so a wrong initial status must fail here.
        $this->assertDatabaseHas('pages', [
            'url' => 'https://example.com/interesting-post',
            'status' => PageStatusEnum::Discovered->value,
        ]);
    }

    // -------------------------------------------------------------------------
    // Test 3 — duplicate submission is idempotent (no second row created)
    // -------------------------------------------------------------------------

    public function test_duplicate_url_submission_does_not_create_second_page(): void
    {
        $url = 'https://example.com/seen-before';

        Page::factory()->create([
            'url' => $url,
            'status' => PageStatusEnum::Discovered,
        ]);

        Livewire::test(UrlSubmissionForm::class)
            ->set('url', $url)
            ->call('submit')
            ->assertHasNoErrors();

        $this->assertDatabaseCount('pages', 1);
    }

    // -------------------------------------------------------------------------
    // Test 4 — confirmation state echoes submitted URL
    // -------------------------------------------------------------------------

    public function test_confirmation_state_echoes_submitted_url(): void
    {
        $url = 'https://example.com/great-article';

        // After a successful submit the input is cleared and the submitted
        // URL is exposed via confirmedUrl and rendered in the output.
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', $url)
            ->call('submit')
            ->assertHasNoErrors()
            ->assertSet('confirmedUrl', $url)
            ->assertSet('url', '')
            ->assertSee($url);
    }

    // -------------------------------------------------------------------------
    // Test 5 — empty URL fails validation (regression lock)
    // -------------------------------------------------------------------------

    public function test_missing_url_fails_validation(): void
    {
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', '')
            ->call('submit')
            ->assertHasErrors(['url' => 'required']);
    }

    // -------------------------------------------------------------------------
    // Test 6 — invalid URL formats fail validation
    // -------------------------------------------------------------------------

    #[DataProvider('invalidUrls')]
    public function test_invalid_url_formats_fail_validation(string $url): void
    {
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', $url)
            ->call('submit')
            ->assertHasErrors('url');
    }

    /**
     * Inputs that must be rejected by the form's URL validation.
     *
     * @return array<string, array{0: string}>
     */
    public static function invalidUrls(): array
    {
        return [
            'no scheme' => ['not-a-url'],
            'disallowed scheme' => ['ftp://example.com'],
            'javascript scheme' => ['javascript:alert(1)'],
        ];
    }

    // -------------------------------------------------------------------------
    // Integration — form submission enqueues a crawl via PageObserver
    // -------------------------------------------------------------------------

    public function test_url_submission_form_enqueues_crawl_via_observer(): void
    {
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', 'https://example.com/article')
            ->call('submit')
            ->assertHasNoErrors();

        $this->assertDatabaseCount('page_crawls', 1);
        $this->assertDatabaseHas('page_crawls', ['domain' => 'example.com']);
    }

    // -------------------------------------------------------------------------
    // Test 7 — rate limit blocks the 11th submission within a minute
    // -------------------------------------------------------------------------

    public function test_rate_limit_blocks_eleventh_submission_within_a_minute(): void
    {
        // 10 submissions within the limit — each must succeed
        for ($i = 1; $i <= 10; $i++) {
            Livewire::test(UrlSubmissionForm::class)
                ->set('url', "https://example.com/post-{$i}")
                ->call('submit')
                ->assertHasNoErrors();
        }

        // 11th submission from the same IP must be blocked, with the message visible
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', 'https://example.com/post-11')
            ->call('submit')
            ->assertHasErrors('rate_limit')
            ->assertSee('Too many submissions');

        // The 11th URL must NOT have been persisted
        $this->assertDatabaseCount('pages', 10);
    }
}

View file

@ -0,0 +1,83 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Actions;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\PageStatusEnum;
use App\Models\Page;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Tests for RegisterDiscoveredPageAction: creating a Discovered page,
 * attaching an instance id, and returning existing rows untouched.
 */
class RegisterDiscoveredPageActionTest extends TestCase
{
    use RefreshDatabase;

    /**
     * A new URL produces a persisted page with Discovered status and no
     * instance id.
     */
    public function test_creates_page_with_url_and_discovered_status(): void
    {
        $register = new RegisterDiscoveredPageAction;

        $created = $register('https://example.com/article');

        $this->assertInstanceOf(Page::class, $created);
        $this->assertSame('https://example.com/article', $created->url);
        $this->assertSame(PageStatusEnum::Discovered, $created->status);
        $this->assertNull($created->instance_id);
        $this->assertDatabaseHas('pages', ['url' => 'https://example.com/article']);
    }

    /**
     * The optional instanceId argument is stored on the new page.
     */
    public function test_creates_page_with_provided_instance_id(): void
    {
        $instance = Instance::factory()
            ->type(InstanceType::Mastodon)
            ->enabled()
            ->create();
        $register = new RegisterDiscoveredPageAction;

        $created = $register('https://example.com/fediverse-post', instanceId: $instance->id);

        $this->assertInstanceOf(Page::class, $created);
        $this->assertSame($instance->id, $created->instance_id);
        $this->assertDatabaseHas('pages', [
            'url' => 'https://example.com/fediverse-post',
            'instance_id' => $instance->id,
        ]);
    }

    /**
     * Registering a known URL returns the existing row instead of inserting
     * a second one.
     */
    public function test_returns_existing_page_when_url_already_exists(): void
    {
        $alreadyKnown = Page::factory()->createQuietly([
            'url' => 'https://example.com/seen-before',
            'status' => PageStatusEnum::Discovered,
        ]);
        $register = new RegisterDiscoveredPageAction;

        $result = $register('https://example.com/seen-before');

        $this->assertSame($alreadyKnown->id, $result->id);
        $this->assertDatabaseCount('pages', 1);
    }

    /**
     * Registering a known URL does not reset its status back to Discovered.
     */
    public function test_existing_page_status_not_overwritten_on_duplicate_call(): void
    {
        Page::factory()->createQuietly([
            'url' => 'https://example.com/already-fetched',
            'status' => PageStatusEnum::Fetched,
        ]);
        $register = new RegisterDiscoveredPageAction;

        $result = $register('https://example.com/already-fetched');

        $this->assertSame(PageStatusEnum::Fetched, $result->status);
        $this->assertDatabaseHas('pages', [
            'url' => 'https://example.com/already-fetched',
            'status' => PageStatusEnum::Fetched,
        ]);
    }
}

View file

@ -0,0 +1,75 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Enums;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use Tests\TestCase;
/**
 * Pins the CrawlOutcomeEnum contract: its seven cases and backing values,
 * and the toPageStatus(), isRetryable() and shouldRegisterOutboundLinks()
 * mappings.
 */
class CrawlOutcomeEnumTest extends TestCase
{
    /**
     * Each backing value resolves to the case with the expected name.
     */
    public function test_all_expected_cases_exist_with_correct_backing_values(): void
    {
        $backingValuesByCase = [
            'Success' => 'success',
            'Failed' => 'failed',
            'Timeout' => 'timeout',
            'BlockedRobots' => 'blocked_robots',
            'Blocked4xx' => 'blocked_4xx',
            'Blocked5xx' => 'blocked_5xx',
            'Rejected' => 'rejected',
        ];

        foreach ($backingValuesByCase as $caseName => $backingValue) {
            $resolved = CrawlOutcomeEnum::from($backingValue);

            $this->assertSame($caseName, $resolved->name, "Case name for '{$backingValue}' should be '{$caseName}'");
            $this->assertSame($backingValue, $resolved->value, "Backing value for '{$caseName}' should be '{$backingValue}'");
        }
    }

    /**
     * Guards against cases being added or removed unnoticed.
     */
    public function test_enum_has_exactly_seven_cases(): void
    {
        $allCases = CrawlOutcomeEnum::cases();

        $this->assertCount(7, $allCases);
    }

    /**
     * Success → Fetched, Rejected → Rejected, everything else → Failed.
     */
    public function test_to_page_status_maps_each_outcome_correctly(): void
    {
        // [outcome, expected page status]
        $mapping = [
            [CrawlOutcomeEnum::Success, PageStatusEnum::Fetched],
            [CrawlOutcomeEnum::Rejected, PageStatusEnum::Rejected],
            [CrawlOutcomeEnum::Failed, PageStatusEnum::Failed],
            [CrawlOutcomeEnum::Timeout, PageStatusEnum::Failed],
            [CrawlOutcomeEnum::Blocked4xx, PageStatusEnum::Failed],
            [CrawlOutcomeEnum::Blocked5xx, PageStatusEnum::Failed],
            [CrawlOutcomeEnum::BlockedRobots, PageStatusEnum::Failed],
        ];

        foreach ($mapping as [$outcome, $expectedStatus]) {
            $this->assertSame($expectedStatus, $outcome->toPageStatus());
        }
    }

    /**
     * Only Failed, Timeout and Blocked5xx are retryable.
     */
    public function test_is_retryable_returns_true_only_for_transient_failures(): void
    {
        // Transient network/server problems that may resolve later.
        $retryable = [
            CrawlOutcomeEnum::Failed,
            CrawlOutcomeEnum::Timeout,
            CrawlOutcomeEnum::Blocked5xx,
        ];
        foreach ($retryable as $outcome) {
            $this->assertTrue($outcome->isRetryable());
        }

        // Success (done), permanent failures, or policy decisions.
        $notRetryable = [
            CrawlOutcomeEnum::Success,
            CrawlOutcomeEnum::Rejected,
            CrawlOutcomeEnum::BlockedRobots,
            CrawlOutcomeEnum::Blocked4xx,
        ];
        foreach ($notRetryable as $outcome) {
            $this->assertFalse($outcome->isRetryable());
        }
    }

    /**
     * Outbound links are registered for Success and for nothing else.
     */
    public function test_should_register_outbound_links_returns_true_only_for_success(): void
    {
        $this->assertTrue(CrawlOutcomeEnum::Success->shouldRegisterOutboundLinks());

        // Every non-Success outcome has no links to register.
        $withoutLinks = [
            CrawlOutcomeEnum::Failed,
            CrawlOutcomeEnum::Timeout,
            CrawlOutcomeEnum::Rejected,
            CrawlOutcomeEnum::BlockedRobots,
            CrawlOutcomeEnum::Blocked4xx,
            CrawlOutcomeEnum::Blocked5xx,
        ];
        foreach ($withoutLinks as $outcome) {
            $this->assertFalse($outcome->shouldRegisterOutboundLinks());
        }
    }
}

View file

@ -0,0 +1,33 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Enums;
use App\Enums\PageStatusEnum;
use Tests\TestCase;
/**
 * Pins the PageStatusEnum contract: its four cases and their backing values.
 */
class PageStatusEnumTest extends TestCase
{
    /**
     * Each backing value resolves to the case with the expected name.
     */
    public function test_all_expected_cases_exist_with_correct_backing_values(): void
    {
        $backingValuesByCase = [
            'Discovered' => 'discovered',
            'Fetched' => 'fetched',
            'Failed' => 'failed',
            'Rejected' => 'rejected',
        ];

        foreach ($backingValuesByCase as $caseName => $backingValue) {
            $resolved = PageStatusEnum::from($backingValue);

            $this->assertSame($caseName, $resolved->name, "Case name for '{$backingValue}' should be '{$caseName}'");
            $this->assertSame($backingValue, $resolved->value, "Backing value for '{$caseName}' should be '{$backingValue}'");
        }
    }

    /**
     * Guards against cases being added or removed unnoticed.
     */
    public function test_enum_has_exactly_four_cases(): void
    {
        $allCases = PageStatusEnum::cases();

        $this->assertCount(4, $allCases);
    }
}

View file

@ -1,16 +0,0 @@
<?php
namespace Tests\Unit;
use PHPUnit\Framework\TestCase;
/**
 * Placeholder unit test that only proves the PHPUnit harness runs.
 */
class ExampleTest extends TestCase
{
    /**
     * Trivial always-passing assertion.
     */
    public function test_that_true_is_true(): void
    {
        self::assertTrue(true);
    }
}

View file

@ -0,0 +1,42 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Models;
use App\Enums\CrawlOutcomeEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Tests\TestCase;
/**
 * Tests for the PageCrawl factory states successful() and failed().
 */
class PageCrawlFactoryTest extends TestCase
{
    use RefreshDatabase;

    /**
     * successful() yields a Success outcome, a completed_at timestamp and
     * no error message.
     */
    public function test_factory_successful_state_produces_success_outcome(): void
    {
        Queue::fake();
        $page = Page::factory()->create();

        $succeeded = PageCrawl::factory()
            ->page($page)
            ->successful()
            ->create();

        $this->assertSame(CrawlOutcomeEnum::Success, $succeeded->outcome);
        $this->assertInstanceOf(Carbon::class, $succeeded->completed_at);
        $this->assertNull($succeeded->error_message);
    }

    /**
     * failed($message) yields a Failed outcome, a completed_at timestamp
     * and the supplied error message.
     */
    public function test_factory_failed_state_produces_failed_outcome_with_message(): void
    {
        Queue::fake();
        $page = Page::factory()->create();

        $failed = PageCrawl::factory()
            ->page($page)
            ->failed('Connection timed out')
            ->create();

        $this->assertSame(CrawlOutcomeEnum::Failed, $failed->outcome);
        $this->assertInstanceOf(Carbon::class, $failed->completed_at);
        $this->assertSame('Connection timed out', $failed->error_message);
    }
}

View file

@ -0,0 +1,111 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Models;
use App\Enums\CrawlOutcomeEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Tests\TestCase;
/**
 * Model tests for PageCrawl: fillable attributes and casts, the page
 * relationship, cascade delete, and filtering pending crawls by outcome.
 */
class PageCrawlTest extends TestCase
{
    use RefreshDatabase;

    /**
     * Mass-assigned attributes survive a round trip through the database
     * and come back with the expected casts applied.
     */
    public function test_page_crawl_fillable_fields_persist_and_casts_are_applied(): void
    {
        Queue::fake();

        $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-1']);
        $completedAt = Carbon::parse('2026-05-01 10:01:05');

        $attributes = [
            'page_id' => $page->id,
            'domain' => 'example.com',
            'priority' => 5,
            'completed_at' => $completedAt,
            'outcome' => CrawlOutcomeEnum::Success,
            'status_code' => 200,
            'error_message' => null,
        ];
        $reloaded = PageCrawl::create($attributes)->fresh();

        $this->assertNotNull($reloaded);

        // Plain scalar columns.
        $this->assertSame('example.com', $reloaded->domain);
        $this->assertSame(5, $reloaded->priority);

        // outcome comes back as the enum case, not a raw string.
        $this->assertInstanceOf(CrawlOutcomeEnum::class, $reloaded->outcome);
        $this->assertSame(CrawlOutcomeEnum::Success, $reloaded->outcome);

        // completed_at comes back as a Carbon instance for the same moment.
        $this->assertInstanceOf(Carbon::class, $reloaded->completed_at);
        $this->assertTrue($completedAt->equalTo($reloaded->completed_at));

        // Nullable column stays null.
        $this->assertNull($reloaded->error_message);

        // status_code round-trips as an integer.
        $this->assertSame(200, $reloaded->status_code);
    }

    /**
     * The page relationship resolves to the owning Page.
     */
    public function test_page_crawl_belongs_to_a_page(): void
    {
        $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-2']);

        $crawl = PageCrawl::create([
            'page_id' => $page->id,
            'domain' => 'example.com',
            'priority' => 1,
        ]);

        $owner = $crawl->page;

        $this->assertInstanceOf(Page::class, $owner);
        $this->assertSame($page->id, $owner->id);
    }

    /**
     * Deleting a page removes all of its crawl rows.
     */
    public function test_deleting_a_page_cascades_to_its_page_crawls(): void
    {
        // createQuietly() bypasses the PageObserver, so only the three rows
        // created explicitly below exist; the subject here is the cascade,
        // not the observer.
        $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-cascade']);

        $crawlsForPage = PageCrawl::factory()->page($page);
        $crawlsForPage->create();
        $crawlsForPage->successful()->create();
        $crawlsForPage->failed('timeout during fetch')->create();

        $this->assertSame(3, PageCrawl::count());

        $page->delete();

        $this->assertSame(0, PageCrawl::count());
    }

    /**
     * Pending crawls are exactly the rows whose outcome is null.
     */
    public function test_pending_crawls_are_filtered_by_null_outcome(): void
    {
        Queue::fake();

        // createQuietly() bypasses the PageObserver: an auto-inserted crawl
        // with a null outcome would throw off both counts asserted below.
        $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-pending']);

        $crawlsForPage = PageCrawl::factory()->page($page);
        $stillPending = $crawlsForPage->create();
        $crawlsForPage->successful()->create();
        $crawlsForPage->failed('connection refused')->create();

        $this->assertSame(1, PageCrawl::whereNull('outcome')->count());
        $this->assertSame($stillPending->id, PageCrawl::whereNull('outcome')->first()->id);
        $this->assertSame(2, PageCrawl::whereNotNull('outcome')->count());
    }
}

View file

@ -0,0 +1,52 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Models;
use App\Models\Page;
use App\Models\PageLink;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Tests\TestCase;
/**
 * Covers the PageLink model: mass assignment, its two Page relationships,
 * and the withSource()/withTarget() factory states.
 */
class PageLinkTest extends TestCase
{
    use RefreshDatabase;

    /**
     * A link created via mass assignment keeps both foreign keys and exposes
     * the matching Page through sourcePage / targetPage.
     */
    public function test_page_link_model_fillable_fields_and_relationships(): void
    {
        [$from, $to] = $this->makeEndpoints();

        $reloaded = PageLink::create([
            'source_page_id' => $from->id,
            'target_page_id' => $to->id,
        ])->fresh();

        $this->assertNotNull($reloaded);

        // Raw foreign keys.
        $this->assertSame($from->id, $reloaded->source_page_id);
        $this->assertSame($to->id, $reloaded->target_page_id);

        // Source side of the relationship.
        $this->assertInstanceOf(Page::class, $reloaded->sourcePage);
        $this->assertSame($from->id, $reloaded->sourcePage->id);

        // Target side of the relationship.
        $this->assertInstanceOf(Page::class, $reloaded->targetPage);
        $this->assertSame($to->id, $reloaded->targetPage->id);
    }

    /**
     * The factory's withSource()/withTarget() states wire the given pages
     * into the created link.
     */
    public function test_page_link_factory_with_source_and_target_methods_create_a_link(): void
    {
        [$from, $to] = $this->makeEndpoints();

        $built = PageLink::factory()->withSource($from)->withTarget($to)->create();

        $this->assertSame($from->id, $built->source_page_id);
        $this->assertSame($to->id, $built->target_page_id);
    }

    /**
     * Create the source and target pages shared by both tests.
     *
     * @return array{0: Page, 1: Page} [source, target]
     */
    private function makeEndpoints(): array
    {
        $from = Page::factory()->create(['url' => 'https://source.example.com/post/1']);
        $to = Page::factory()->create(['url' => 'https://target.example.com/page/2']);

        return [$from, $to];
    }
}

View file

@ -0,0 +1,195 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Models;
use App\Enums\PageStatusEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use App\Models\PageLink;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Model-level tests for Page: mass assignment, relationships (instance,
 * links, crawls, latestCrawl) and attribute casts.
 */
class PageTest extends TestCase
{
    use RefreshDatabase;

    /**
     * Fake the queue so that creating pages never pushes a real job.
     */
    protected function setUp(): void
    {
        parent::setUp();

        Queue::fake();
    }

    /**
     * Every attribute passed to Page::create() is persisted and readable after reload.
     */
    public function test_page_model_fillable_fields_can_be_mass_assigned(): void
    {
        $page = Page::create([
            'url' => 'https://example.com/article',
            'status' => 'discovered',
            'title' => 'An Example Article',
            'instance_id' => null,
            'posted_at' => null,
            'fetched_at' => null,
        ]);

        $fresh = $page->fresh();

        $this->assertNotNull($fresh);
        $this->assertSame('https://example.com/article', $fresh->url);
        $this->assertSame('An Example Article', $fresh->title);
        $this->assertNull($fresh->instance_id);

        // Assert the remaining assigned attributes too, so a field silently dropped
        // from the fillable list cannot go unnoticed.
        $this->assertSame(PageStatusEnum::Discovered, $fresh->status);
        $this->assertNull($fresh->posted_at);
        $this->assertNull($fresh->fetched_at);
    }

    /**
     * The `instance` relationship resolves to the Instance referenced by instance_id.
     */
    public function test_page_instance_relationship_returns_the_owning_instance(): void
    {
        $instance = Instance::factory()
            ->type(InstanceType::Mastodon)
            ->enabled()
            ->create();

        $page = Page::create([
            'url' => 'https://example.com/post/1',
            'status' => 'discovered',
            'instance_id' => $instance->id,
        ]);

        $fresh = $page->fresh();

        $this->assertNotNull($fresh);
        $this->assertInstanceOf(Instance::class, $fresh->instance);
        $this->assertSame($instance->id, $fresh->instance->id);
    }

    /**
     * A single link shows up as outgoing on its source and incoming on its target, and nowhere else.
     */
    public function test_page_outgoing_and_incoming_links_relationships(): void
    {
        $source = Page::factory()->create(['url' => 'https://example.com/source']);
        $target = Page::factory()->create(['url' => 'https://example.com/target']);

        PageLink::create([
            'source_page_id' => $source->id,
            'target_page_id' => $target->id,
        ]);

        $freshSource = $source->fresh();
        $freshTarget = $target->fresh();

        $this->assertCount(1, $freshSource->outgoingLinks);
        $this->assertCount(0, $freshSource->incomingLinks);
        $this->assertCount(1, $freshTarget->incomingLinks);
        $this->assertCount(0, $freshTarget->outgoingLinks);

        $this->assertSame($source->id, $freshTarget->incomingLinks->first()->source_page_id);
        $this->assertSame($target->id, $freshSource->outgoingLinks->first()->target_page_id);
    }

    /**
     * `language` is mass-assignable and is null when not provided.
     */
    public function test_page_language_is_fillable_and_persists(): void
    {
        $page = Page::create([
            'url' => 'https://example.com/crawled',
            'status' => 'discovered',
            'language' => 'en',
        ]);

        $fresh = $page->fresh();

        $this->assertNotNull($fresh);
        $this->assertSame('en', $fresh->language);

        $unset = Page::create([
            'url' => 'https://example.com/no-language',
            'status' => 'discovered',
        ]);

        $this->assertNull($unset->fresh()->language);
    }

    /**
     * `crawls` returns only the crawl rows that belong to the page.
     */
    public function test_page_has_many_crawls(): void
    {
        // createQuietly() skips the PageObserver so no auto-crawl row is inserted;
        // this test is about HasMany scoping, not observer side effects.
        $page = Page::factory()->createQuietly();
        $other = Page::factory()->createQuietly();

        PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']);
        PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']);
        PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']);
        PageCrawl::create(['page_id' => $other->id, 'domain' => 'other.com']);

        $crawls = $page->fresh()->crawls;

        $this->assertCount(3, $crawls);

        foreach ($crawls as $crawl) {
            $this->assertInstanceOf(PageCrawl::class, $crawl);
            $this->assertSame($page->id, $crawl->page_id);
        }
    }

    /**
     * `latestCrawl` is chosen by created_at, not by insertion order.
     */
    public function test_page_latest_crawl_returns_row_with_latest_created_at(): void
    {
        // createQuietly() skips the PageObserver; this test is about latestOfMany ordering,
        // not observer side effects. Using create() would add an observer crawl whose
        // created_at is now(), making the test fragile once the hardcoded sentinel date passes.
        $page = Page::factory()->createQuietly();

        // The chronologically newest row is inserted FIRST, so it carries the lowest id.
        // If rows were inserted in chronological order, a relation that ordered by id
        // would return the same row as one ordered by created_at, and the test could
        // not tell the two apart.
        $this->crawlCreatedAt($page, '2026-05-10 18:00:00', ['error_message' => 'sentinel-latest']);
        $this->crawlCreatedAt($page, '2026-01-01 08:00:00');
        $this->crawlCreatedAt($page, '2026-03-15 12:00:00');

        $latest = $page->fresh()->latestCrawl;

        $this->assertInstanceOf(PageCrawl::class, $latest);
        $this->assertSame('sentinel-latest', $latest->error_message);
    }

    /**
     * `language_confidence` is mass-assignable, nullable, and cast to float.
     */
    public function test_language_confidence_is_fillable_nullable_and_cast_to_float(): void
    {
        // Column must exist, be nullable (null round-trips cleanly), be mass-assignable,
        // and the 'float' cast must be applied so we get a PHP float back, not a string.
        $withConfidence = Page::factory()->createQuietly([
            'language' => 'en',
            'language_confidence' => 0.857,
        ]);

        $fresh = $withConfidence->fresh();

        $this->assertNotNull($fresh);
        $this->assertIsFloat($fresh->language_confidence);
        $this->assertEqualsWithDelta(0.857, $fresh->language_confidence, 0.001);

        $withoutConfidence = Page::factory()->createQuietly();

        $this->assertNull($withoutConfidence->fresh()->language_confidence);
    }

    /**
     * Each raw status string stored on a page is read back as the matching PageStatusEnum case.
     */
    public function test_page_status_is_cast_to_enum(): void
    {
        $cases = [
            ['string' => 'discovered', 'enum' => PageStatusEnum::Discovered],
            ['string' => 'fetched', 'enum' => PageStatusEnum::Fetched],
            ['string' => 'failed', 'enum' => PageStatusEnum::Failed],
        ];

        foreach ($cases as ['string' => $raw, 'enum' => $expected]) {
            $page = Page::create([
                'url' => 'https://example.com/' . $raw,
                'status' => $raw,
            ]);

            $fresh = $page->fresh();

            $this->assertInstanceOf(PageStatusEnum::class, $fresh->status, "status '{$raw}' should cast to PageStatusEnum");
            $this->assertSame($expected, $fresh->status, "status '{$raw}' should equal PageStatusEnum::{$expected->name}");
        }
    }

    /**
     * Create a crawl for $page and then overwrite its created_at.
     *
     * created_at is assigned on the saved model and persisted with a second
     * save(), rather than being passed through mass assignment.
     *
     * @param  Page   $page        page the crawl belongs to
     * @param  string $createdAt   timestamp parsed with Carbon::parse()
     * @param  array  $attributes  extra attributes merged into the create() payload
     */
    private function crawlCreatedAt(Page $page, string $createdAt, array $attributes = []): PageCrawl
    {
        $crawl = PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com'] + $attributes);
        $crawl->created_at = Carbon::parse($createdAt);
        $crawl->save();

        return $crawl;
    }
}

View file

@ -0,0 +1,74 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Services;
use App\Services\LanguageDetectionService;
use Tests\TestCase;
/**
 * Exercises LanguageDetectionService::detect() with two natural-language
 * paragraphs and with empty / blank input.
 */
class LanguageDetectionServiceTest extends TestCase
{
    private LanguageDetectionService $service;

    protected function setUp(): void
    {
        parent::setUp();

        $this->service = new LanguageDetectionService;
    }

    /**
     * An English paragraph yields a two-element result whose tag starts with "en".
     */
    public function test_detects_english_from_english_paragraph(): void
    {
        // The literal's continuation lines are deliberately flush-left: the bytes
        // of the fixture text are part of the test input.
        $paragraph = 'The solar system is the gravitationally bound system of the Sun and the
objects that orbit it. Of the bodies that orbit the Sun directly, the largest
are the eight planets, with the remainder being smaller objects, the dwarf
planets and small solar system bodies. Planets and most other large bodies
in the solar system orbit the Sun in the same direction, counterclockwise
when viewed from above the Sun\'s north pole.';

        $detected = $this->service->detect($paragraph);

        $this->assertIsArray($detected);
        $this->assertCount(2, $detected);

        [$tag, $confidence] = $detected;

        $this->assertTrue(
            str_starts_with($tag, 'en'),
            "Expected an English-family tag, got '{$tag}'.",
        );
        $this->assertConfidenceInRange($confidence);
    }

    /**
     * A Portuguese paragraph yields a two-element result whose tag starts with "pt".
     */
    public function test_detects_portuguese_from_portuguese_paragraph(): void
    {
        // Continuation lines are flush-left for the same reason as the English fixture.
        $paragraph = 'O sistema solar é o sistema gravitacionalmente ligado composto pelo Sol e
pelos objetos que orbitam ao seu redor. Dos corpos que orbitam o Sol
diretamente, os maiores são os oito planetas, sendo o restante composto por
objetos menores, como planetas anões e corpos menores do sistema solar.
A Terra é o único planeta conhecido a abrigar vida, possuindo uma atmosfera
rica em nitrogênio e oxigênio que sustenta os seres vivos.';

        $detected = $this->service->detect($paragraph);

        $this->assertIsArray($detected);
        $this->assertCount(2, $detected);

        [$tag, $confidence] = $detected;

        $this->assertTrue(
            str_starts_with($tag, 'pt'),
            "Expected a Portuguese-family tag, got '{$tag}'.",
        );
        $this->assertConfidenceInRange($confidence);
    }

    /**
     * Empty input produces no detection.
     */
    public function test_returns_null_for_empty_string(): void
    {
        $outcome = $this->service->detect('');

        $this->assertNull($outcome);
    }

    /**
     * Blank (whitespace-only) input produces no detection.
     */
    public function test_returns_null_for_whitespace_only_string(): void
    {
        $outcome = $this->service->detect(' ');

        $this->assertNull($outcome);
    }

    /**
     * The confidence must be a float in the half-open interval (0.0, 1.0].
     *
     * @param  mixed  $confidence  second element of the detect() result
     */
    private function assertConfidenceInRange(mixed $confidence): void
    {
        $this->assertIsFloat($confidence);
        $this->assertGreaterThan(0.0, $confidence);
        $this->assertLessThanOrEqual(1.0, $confidence);
    }
}

View file

@ -0,0 +1,56 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Services;
use App\Services\PolitenessService;
use Illuminate\Support\Facades\Http;
use Tests\TestCase;
/**
 * Verifies that PolitenessService::minDelayFor() returns the configured
 * minimum delay, or the robots.txt Crawl-delay when that is larger.
 */
class PolitenessServiceTest extends TestCase
{
    /** User agent written into both the fake robots.txt and the crawler config. */
    private const USER_AGENT = 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)';

    /**
     * With nothing overridden, the delay is the 10 seconds this suite expects from config.
     */
    public function test_min_delay_for_returns_config_default(): void
    {
        $service = new PolitenessService;

        $this->assertSame(10, $service->minDelayFor('example.com'));
    }

    /**
     * Changing the config value changes the returned delay.
     */
    public function test_min_delay_for_respects_config_override(): void
    {
        config()->set('crawler.min_domain_delay_seconds', 30);

        $service = new PolitenessService;

        $this->assertSame(30, $service->minDelayFor('example.com'));
    }

    /**
     * robots.txt asks for 30s, config asks for 10s: the larger value wins.
     */
    public function test_min_delay_for_uses_robots_crawl_delay_when_higher(): void
    {
        $this->fakeRobotsCrawlDelay(30);
        $this->configureCrawler(10);

        $delay = app(PolitenessService::class)->minDelayFor('example.com');

        $this->assertSame(30, $delay);
    }

    /**
     * robots.txt asks for 10s, config asks for 60s: the larger value wins.
     */
    public function test_min_delay_for_uses_config_when_higher_than_robots(): void
    {
        $this->fakeRobotsCrawlDelay(10);
        $this->configureCrawler(60);

        $delay = app(PolitenessService::class)->minDelayFor('example.com');

        $this->assertSame(60, $delay);
    }

    /**
     * Serve a fake robots.txt for example.com carrying the given Crawl-delay.
     */
    private function fakeRobotsCrawlDelay(int $seconds): void
    {
        // Spatie does exact-token matching (lowercased), so the fixture UA
        // must match the full string the service passes to crawlDelayFor().
        $robotsTxt = 'User-agent: ' . self::USER_AGENT . "\nCrawl-delay: {$seconds}";

        Http::fake([
            'https://example.com/robots.txt' => Http::response($robotsTxt, 200),
        ]);
    }

    /**
     * Set the configured minimum delay and the crawler user agent.
     */
    private function configureCrawler(int $minDelaySeconds): void
    {
        config()->set('crawler.min_domain_delay_seconds', $minDelaySeconds);
        config()->set('crawler.user_agent', self::USER_AGENT);
    }
}

View file

@ -0,0 +1,96 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Services;
use App\Services\RobotsService;
use Illuminate\Support\Facades\Http;
use Tests\TestCase;
/**
 * Tests RobotsService against faked robots.txt responses for example.com:
 * allow/disallow decisions, fetch failure, per-host caching, and Crawl-delay parsing.
 */
class RobotsServiceTest extends TestCase
{
    /** User agent passed to the service in every test. */
    private const BOT = 'TroveBot/0.1';

    /**
     * A blanket "Allow: /" permits the URL.
     */
    public function test_is_allowed_returns_true_when_robots_txt_allows_path(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: *\nAllow: /");

        $this->assertTrue($robots->isAllowed('https://example.com/article', self::BOT));
    }

    /**
     * A blanket "Disallow: /" blocks the URL.
     */
    public function test_is_allowed_returns_false_when_robots_txt_disallows_path(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: *\nDisallow: /");

        $this->assertFalse($robots->isAllowed('https://example.com/article', self::BOT));
    }

    /**
     * A 500 response for robots.txt results in the URL being treated as allowed.
     */
    public function test_is_allowed_returns_true_when_robots_txt_fetch_fails(): void
    {
        $robots = $this->serviceWithRobotsTxt('', 500);

        $this->assertTrue($robots->isAllowed('https://example.com/article', self::BOT));
    }

    /**
     * Two checks against the same host trigger only one HTTP request.
     */
    public function test_is_allowed_caches_robots_txt_body_per_host(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: *\nAllow: /");

        foreach (['https://example.com/article', 'https://example.com/another-article'] as $url) {
            $robots->isAllowed($url, self::BOT);
        }

        Http::assertSentCount(1);
    }

    /**
     * A Crawl-delay directive for the bot's user agent is returned as an int.
     */
    public function test_crawl_delay_for_returns_parsed_value(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: TroveBot/0.1\nCrawl-delay: 30");

        $this->assertSame(30, $robots->crawlDelayFor('example.com', self::BOT));
    }

    /**
     * Without a Crawl-delay directive the result is null.
     */
    public function test_crawl_delay_for_returns_null_when_absent(): void
    {
        $robots = $this->serviceWithRobotsTxt("User-agent: *\nDisallow: /private");

        $this->assertNull($robots->crawlDelayFor('example.com', self::BOT));
    }

    /**
     * Fake https://example.com/robots.txt with the given body and status,
     * then resolve the service from the container.
     */
    private function serviceWithRobotsTxt(string $body, int $status = 200): RobotsService
    {
        Http::fake([
            'https://example.com/robots.txt' => Http::response($body, $status),
        ]);

        return app(RobotsService::class);
    }
}

View file

@ -0,0 +1,111 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Services;
use App\Services\UrlService;
use PHPUnit\Framework\Attributes\DataProvider;
use Tests\TestCase;
/**
 * Tests UrlService::host(): extraction of a lowercase, port-less host from
 * http(s) URLs, and rejection of everything else with InvalidArgumentException.
 */
class UrlServiceTest extends TestCase
{
    private UrlService $urls;

    protected function setUp(): void
    {
        parent::setUp();

        $this->urls = new UrlService;
    }

    /**
     * Happy path: a bare scheme-plus-host URL.
     */
    public function test_extracts_host_from_simple_url(): void
    {
        $host = $this->urls->host('https://example.com');

        $this->assertSame('example.com', $host);
    }

    /**
     * Path, query string, and fragment do not affect the extracted host.
     */
    #[DataProvider('urlsWithNoise')]
    public function test_extracts_host_ignoring_path_query_and_fragment(string $url, string $expectedHost): void
    {
        $host = $this->urls->host($url);

        $this->assertSame($expectedHost, $host);
    }

    /**
     * @return array<string, array{0: string, 1: string}> [url, expected host] keyed by case name
     */
    public static function urlsWithNoise(): array
    {
        return [
            'path only' => ['https://example.com/some/path', 'example.com'],
            'path and query' => ['https://example.com/page?q=hello&lang=en', 'example.com'],
            'path, query, fragment' => ['https://example.com/page?q=1#section', 'example.com'],
            'http scheme with path' => ['http://news.ycombinator.com/item?id=42', 'news.ycombinator.com'],
        ];
    }

    /**
     * An explicit port is not part of the returned host.
     */
    public function test_strips_port_from_host(): void
    {
        $host = $this->urls->host('https://example.com:8080/path');

        $this->assertSame('example.com', $host);
    }

    /**
     * The host is returned in lowercase regardless of input casing.
     */
    public function test_lowercases_host(): void
    {
        $host = $this->urls->host('https://EXAMPLE.COM/path');

        $this->assertSame('example.com', $host);
    }

    /**
     * Malformed, disallowed, or IP-literal input is rejected with an exception.
     */
    #[DataProvider('invalidInputs')]
    public function test_throws_on_invalid_input(string $url): void
    {
        $this->expectException(\InvalidArgumentException::class);

        $this->urls->host($url);
    }

    /**
     * @return array<string, array{0: string}> rejected URLs keyed by case name
     */
    public static function invalidInputs(): array
    {
        return [
            // Structurally broken input
            'empty string' => [''],
            'no scheme' => ['example.com/path'],
            'scheme only' => ['https://'],
            'bare string' => ['not a url at all'],

            // Schemes other than http/https
            'javascript scheme' => ['javascript:alert(1)'],
            'ftp scheme' => ['ftp://example.com'],
            'data scheme' => ['data:text/html,<h1>hi</h1>'],

            // IP literals are not valid page-URL hosts for Trove's purposes
            'ipv4 literal' => ['https://192.168.1.1/path'],
            'ipv6 literal' => ['https://[::1]/path'],
            'ipv4 without path' => ['http://10.0.0.1'],

            // Userinfo (embedded credentials) is a phishing/SSRF flag
            'embedded credentials' => ['https://user:pass@example.com/'],
            'username only' => ['https://user@example.com/'],

            // IPv6 with zone identifier: the zone suffix defeats FILTER_VALIDATE_IP
            'ipv6 with zone' => ['https://[fe80::1%25eth0]/'],

            // IPv4-mapped IPv6: FILTER_VALIDATE_IP recognises ::ffff:x.x.x.x as valid IPv6
            'ipv4 mapped ipv6' => ['https://[::ffff:192.0.2.1]/path'],
        ];
    }
}

Some files were not shown because too many files have changed in this diff Show more