Merge release/0.1.0 into main
Some checks failed
Build and Push Docker Image / build (push) Has been cancelled

This commit is contained in:
myrmidex 2026-04-29 23:29:27 +02:00
commit 7e62cbc613
107 changed files with 7359 additions and 342 deletions

49
.dockerignore Normal file
View file

@ -0,0 +1,49 @@
# Version control
.git
.gitignore
.gitattributes
# Dev environment
shell.nix
Dockerfile.dev
docker/
# Tests (not needed in prod image)
tests/
phpunit.xml
.phpunit.result.cache
phpstan.neon
# Dependencies (rebuilt during image build)
node_modules/
vendor/
# Build artifacts (frontend stage produces these)
public/build/
public/hot
# Editor / OS
.editorconfig
.idea/
.vscode/
.DS_Store
*.swp
*.swo
# Env / secrets
.env
.env.*
!.env.example
# Logs and runtime caches
storage/logs/*.log
storage/framework/cache/data/
storage/framework/sessions/
storage/framework/views/
# CI
.forgejo/
# Docs / project meta
README.md
LICENSE

View file

@ -61,3 +61,9 @@ AWS_BUCKET=
AWS_USE_PATH_STYLE_ENDPOINT=false
VITE_APP_NAME="${APP_NAME}"
CRAWLER_MIN_DOMAIN_DELAY_SECONDS=10
NTFY_URL=
NTFY_TOPIC=
NTFY_THRESHOLD=

View file

@ -5,8 +5,7 @@ on:
branches: [main]
tags: ['v*']
paths:
- 'Dockerfile'
- 'docker/**'
- 'docker/prod/Dockerfile'
- 'app/**'
- 'bootstrap/**'
- 'config/**'
@ -51,6 +50,6 @@ jobs:
uses: https://data.forgejo.org/docker/build-push-action@v5
with:
context: .
file: Dockerfile
file: docker/prod/Dockerfile
push: true
tags: ${{ steps.meta.outputs.tags }}

126
README.md
View file

@ -1,6 +1,128 @@
# trove
# Trove
A small web search engine.
A federated search engine for the small web. Seeded by fediverse attention, ranked by domain coherence rather than commercial authority.
## Tech stack
Laravel 13 · Livewire 4 · PostgreSQL 17 (tsvector FTS) · Redis 7 · FrankenPHP · Vite 8 · Tailwind 4.
## Local development
Requires [Nix](https://nixos.org/download/) and [Podman](https://podman.io/).
```sh
nix-shell # enter dev shell
dev-up # start app, db, redis
```
App: `http://localhost:8200` · Vite HMR: `http://localhost:5175`
Other helpers inside the nix shell: `dev-down`, `dev-rebuild`, `dev-shell`, `dev-artisan <cmd>`, `dev-logs`.
## Self-hosting
Trove ships as a Docker image published to `forge.lvl0.xyz/lvl0/trove`. You provide the compose/stack config.
### Required environment
| Variable | Purpose |
|---|---|
| `APP_KEY` | Laravel app key. Generate with `docker run --rm forge.lvl0.xyz/lvl0/trove:latest php artisan key:generate --show`. **Must persist across deployments** or sessions/encrypted data break. |
| `APP_URL` | Public URL, e.g. `https://trove.example.org` |
| `DB_DATABASE`, `DB_USERNAME`, `DB_PASSWORD` | PostgreSQL credentials |
| `DB_HOST` | Hostname of the PostgreSQL service. Default `db`. Override if your service is named differently. |
| `REDIS_HOST` | Hostname of the Redis service. Default `redis`. Override if your service is named differently. |
### Services you need to provide
- **App**: pull `forge.lvl0.xyz/lvl0/trove:latest` (or a pinned `v*` tag). Exposes port `8000` inside the container. The image runs migrations and warms caches on boot.
- **Worker**: same image as `app`, with `command: php artisan queue:work --tries=3 --max-time=3600`. Processes the crawler queue (URL fetching, content extraction, retries). Crawls won't actually run without this — `app` only enqueues work. **Required for the crawler to function.**
- **PostgreSQL 17**. Hostname must be reachable as `db` (default) or set `DB_HOST`. Persist `/var/lib/postgresql/data`.
- **Redis 7** with `--appendonly yes` (queue jobs persist across restarts). Hostname `redis` or set `REDIS_HOST`.
On every boot the startup script waits up to 60 seconds for PostgreSQL to become reachable, warms caches, then runs `php artisan migrate --force` automatically. The wait loop covers slow PostgreSQL initialisation; if the database never becomes reachable, the script exits with a clear error.
### Volumes
- `/app/storage` — Laravel writable paths (logs, cached views, uploads). Persist this.
### Healthcheck
The image exposes `GET /up` (Laravel's built-in health route). The Dockerfile declares a HEALTHCHECK; your orchestrator can use `curl -fsS http://localhost:8000/up` for liveness.
### Example compose stack
A minimal reference — adapt for your infra. DockGE, Portainer, `docker compose`, Kubernetes, and bare `podman play kube` all work with equivalent configs.
```yaml
services:
app:
image: forge.lvl0.xyz/lvl0/trove:latest
restart: always
ports: ["${APP_PORT:-8400}:8000"]
environment:
APP_KEY: "${APP_KEY}"
APP_URL: "${APP_URL}"
DB_DATABASE: "${DB_DATABASE}"
DB_USERNAME: "${DB_USERNAME}"
DB_PASSWORD: "${DB_PASSWORD}"
volumes:
- app_storage:/app/storage
depends_on:
db: { condition: service_healthy }
redis: { condition: service_healthy }
worker:
image: forge.lvl0.xyz/lvl0/trove:latest
restart: always
command: php artisan queue:work --tries=3 --max-time=3600
environment:
APP_KEY: "${APP_KEY}"
APP_URL: "${APP_URL}"
DB_DATABASE: "${DB_DATABASE}"
DB_USERNAME: "${DB_USERNAME}"
DB_PASSWORD: "${DB_PASSWORD}"
volumes:
- app_storage:/app/storage
depends_on:
db: { condition: service_healthy }
redis: { condition: service_healthy }
db:
image: postgres:17-alpine
restart: always
environment:
POSTGRES_DB: "${DB_DATABASE}"
POSTGRES_USER: "${DB_USERNAME}"
POSTGRES_PASSWORD: "${DB_PASSWORD}"
volumes:
- db_data:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"]
interval: 10s
retries: 5
start_period: 10s
redis:
image: redis:7-alpine
restart: always
command: redis-server --appendonly yes
volumes:
- redis_data:/data
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
retries: 5
volumes:
db_data:
redis_data:
app_storage:
```
### Upgrades
Pull the new image tag, recreate the app container. Migrations run on boot (`php artisan migrate --force` in the startup script). Rollback by pointing at the previous `v*` tag.
----

View file

@ -0,0 +1,194 @@
<?php
declare(strict_types=1);
namespace App\Actions;
use App\Enums\CrawlOutcomeEnum;
use App\Services\LanguageDetectionService;
use App\Services\UrlService;
use App\ValueObjects\FetchResult;
use fivefilters\Readability\Configuration;
use fivefilters\Readability\Readability;
use GuzzleHttp\Exception\ConnectException;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\Factory;
use Illuminate\Http\Client\Response;
use InvalidArgumentException;
use League\Uri\BaseUri;
use Symfony\Component\DomCrawler\Crawler;
use Throwable;
/**
 * Fetches one URL over HTTP and distils the response into a FetchResult:
 * outcome classification, page title, readable text, outbound links and a
 * best-effort language guess.
 */
class FetchPageAction
{
    /** Minimum number of words before text-based language detection is attempted. */
    private const MIN_WORDS_FOR_TEXT_DETECTION = 20;

    /** Text-based detections scoring below this confidence are discarded. */
    private const MIN_TEXT_DETECTION_CONFIDENCE = 0.30;

    public function __construct(
        private Factory $http,
        private UrlService $urlService,
        private LanguageDetectionService $languageDetection,
    ) {}

    /**
     * Fetch $url and describe what happened.
     *
     * Connection-level failures are converted into a Failed/Timeout result
     * instead of being thrown. Content fields (title, text, links, language)
     * are only populated when the response validates as Success.
     *
     * @param  string  $url  Absolute URL to fetch.
     */
    public function __invoke(string $url): FetchResult
    {
        try {
            $response = $this->http
                ->timeout(config('crawler.timeout'))
                ->withHeaders([
                    'User-Agent' => config('crawler.user_agent'),
                    'Accept' => 'text/html',
                ])
                ->withOptions([
                    'allow_redirects' => ['max' => config('crawler.max_redirects')],
                ])
                ->get($url);
        } catch (ConnectionException|ConnectException $e) {
            return $this->failureResult($e);
        }

        [$outcome, $error] = $this->validateResponse($response);

        if ($outcome === CrawlOutcomeEnum::Success) {
            [$title, $extractedText, $links, $crawler] = $this->extractTitleTextAndLinks($response->body(), $url);

            // preg_split() returns false (not an array) when the subject is not
            // valid UTF-8 under the /u flag; count(false) would be a TypeError.
            $words = $extractedText !== '' ? preg_split('/\s+/u', trim($extractedText)) : [];
            $wordCount = is_array($words) ? count($words) : 0;

            [$language, $languageConfidence] = $this->detectLanguage($crawler, $extractedText, $wordCount);
        }

        // Variables assigned only inside the Success branch fall back to null /
        // an empty collection for every other outcome.
        return new FetchResult(
            outcome: $outcome,
            statusCode: $response->status(),
            finalUrl: $url,
            title: $title ?? null,
            extractedText: $extractedText ?? null,
            outboundLinks: $links ?? collect(),
            wordCount: $wordCount ?? null,
            errorMessage: $error ?? null,
            language: $language ?? null,
            languageConfidence: $languageConfidence ?? null,
        );
    }

    /**
     * Classify a received response.
     *
     * @return array{0: CrawlOutcomeEnum, 1: ?string} Outcome plus an error message (null on success).
     */
    private function validateResponse(Response $response): array
    {
        $status = $response->status();

        if ($status >= 400 && $status < 500) {
            return [CrawlOutcomeEnum::Blocked4xx, "HTTP {$status}"];
        }

        if ($status >= 500) {
            return [CrawlOutcomeEnum::Blocked5xx, "HTTP {$status}"];
        }

        // Only text/html bodies are accepted; anything else is Rejected.
        $contentType = $response->header('Content-Type');
        if (! str_starts_with(mb_strtolower($contentType), 'text/html')) {
            return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"];
        }

        return [CrawlOutcomeEnum::Success, null];
    }

    /**
     * Build the result for a request that never produced a response.
     * A cURL "operation timed out" errno maps to Timeout; everything else to Failed.
     */
    private function failureResult(ConnectionException|ConnectException $e): FetchResult
    {
        // The cURL errno lives on the Guzzle exception, which may be the
        // exception itself or the previous exception of Laravel's wrapper.
        $guzzleException = $e instanceof ConnectException
            ? $e
            : ($e->getPrevious() instanceof ConnectException
                ? $e->getPrevious()
                : null);

        $errno = $guzzleException?->getHandlerContext()['errno'] ?? null;

        $outcome = $errno === CURLE_OPERATION_TIMEDOUT
            ? CrawlOutcomeEnum::Timeout
            : CrawlOutcomeEnum::Failed;

        return new FetchResult(
            outcome: $outcome,
            statusCode: null,
            finalUrl: null,
            title: null,
            extractedText: null,
            outboundLinks: collect(),
            wordCount: null,
            errorMessage: $e->getMessage(),
        );
    }

    /**
     * Pull the <title>, the readable main text and the links inside that main
     * content out of an HTML body.
     *
     * @param  string  $body  Raw HTML.
     * @param  string  $url   URL the body was fetched from; base for resolving relative links.
     * @return array{0: ?string, 1: string, 2: \Illuminate\Support\Collection<int, string>, 3: Crawler}
     */
    private function extractTitleTextAndLinks(string $body, string $url): array
    {
        $crawler = new Crawler($body);

        $title = $crawler->filter('title')->count() > 0
            ? trim($crawler->filter('title')->text())
            : null;

        // Readability throws ParseException when it cannot isolate an article
        // (very short pages, navigation-only pages, empty bodies). That is a
        // property of the page, not a crawl failure, so treat it as "no main
        // content" rather than letting the exception abort the job.
        try {
            $readability = new Readability(new Configuration);
            $readability->parse($body);
            $mainContent = $readability->getContent() ?? '';
        } catch (\fivefilters\Readability\ParseException) {
            $mainContent = '';
        }

        $extractedText = trim(strip_tags($mainContent));

        $links = collect();
        if ($mainContent !== '') {
            $linkCrawler = new Crawler($mainContent);
            if ($linkCrawler->filter('a[href]')->count() > 0) {
                $links = collect($linkCrawler->filter('a[href]')->extract(['href']));
            }
        }

        $linksResolved = $links
            ->map(fn (string $href) => $this->resolveAndValidateLink($href, $url))
            ->filter()
            ->unique()
            ->values();

        return [$title, $extractedText, $linksResolved, $crawler];
    }

    /**
     * Resolve $href against $finalUrl, strip any fragment, and return it only
     * if it is not the page itself and UrlService::host() accepts it.
     *
     * @return ?string Absolute URL, or null when the link should be dropped.
     */
    private function resolveAndValidateLink(string $href, string $finalUrl): ?string
    {
        try {
            $resolved = (string) BaseUri::from($finalUrl)->resolve($href);
            // strstr() yields ''/false when there is nothing before a '#';
            // keep the resolved URL unchanged in that case.
            $resolved = strstr($resolved, '#', true) ?: $resolved;
        } catch (Throwable) {
            return null;
        }

        if ($resolved === $finalUrl) {
            return null;
        }

        try {
            $this->urlService->host($resolved);
        } catch (InvalidArgumentException) {
            return null;
        }

        return $resolved;
    }

    /**
     * Guess the page language: text-based detection first (when there is
     * enough text and the score is high enough), then the <html lang>
     * attribute, which is reported with confidence 1.0.
     *
     * @return array{0: ?string, 1: ?float}
     */
    private function detectLanguage(Crawler $crawler, string $extractedText, ?int $wordCount = null): array
    {
        if ($wordCount >= self::MIN_WORDS_FOR_TEXT_DETECTION) {
            $result = $this->languageDetection->detect($extractedText);
            if ($result !== null && $result[1] >= self::MIN_TEXT_DETECTION_CONFIDENCE) {
                return [$result[0], $result[1]];
            }
        }

        $lang = $crawler->filter('html')->count() > 0
            ? trim($crawler->filter('html')->attr('lang') ?? '')
            : '';

        // Over-long lang attributes are ignored (35-character limit).
        if ($lang !== '' && strlen($lang) <= 35) {
            return [$lang, 1.0];
        }

        return [null, null];
    }
}

View file

@ -0,0 +1,22 @@
<?php
declare(strict_types=1);
namespace App\Actions;
use App\Enums\PageStatusEnum;
use App\Models\Page;
/**
 * Looks up the Page row for a URL, creating it in the Discovered state
 * (optionally attributed to an instance) when it does not exist yet.
 */
class RegisterDiscoveredPageAction
{
    /**
     * @param  string  $url         URL that identifies the page.
     * @param  ?int    $instanceId  Instance to attribute a newly created page to, if any.
     * @return Page The existing or newly created page.
     */
    public function __invoke(string $url, ?int $instanceId = null): Page
    {
        $identity = ['url' => $url];

        // Only applied when the row has to be created.
        $defaultsForNewRow = [
            'status' => PageStatusEnum::Discovered,
            'instance_id' => $instanceId,
        ];

        return Page::firstOrCreate($identity, $defaultsForNewRow);
    }
}

View file

@ -0,0 +1,60 @@
<?php
declare(strict_types=1);
namespace App\Enums;
/**
 * Result of a single crawl attempt, plus the policy decisions derived from it.
 */
enum CrawlOutcomeEnum: string
{
    case Success = 'success';
    case Failed = 'failed';
    case Timeout = 'timeout';
    case BlockedRobots = 'blocked_robots';
    case Blocked4xx = 'blocked_4xx';
    case Blocked5xx = 'blocked_5xx';

    /**
     * The HTTP fetch succeeded (2xx) but the response is unindexable in v0.1
     * (non-HTML Content-Type). The worker MUST also write
     * `pages.status = Rejected` on this outcome; do NOT treat it as Failed.
     * The page row STAYS in the DB to prevent re-discovery loops as the
     * fediverse re-shares the URL.
     */
    case Rejected = 'rejected';

    /**
     * Status the parent `pages` row should end up with for this outcome.
     */
    public function toPageStatus(): PageStatusEnum
    {
        return match ($this) {
            self::Rejected => PageStatusEnum::Rejected,
            self::Success => PageStatusEnum::Fetched,
            self::Blocked5xx,
            self::Blocked4xx,
            self::BlockedRobots,
            self::Timeout,
            self::Failed => PageStatusEnum::Failed,
        };
    }

    /**
     * Whether the worker should try again: true for transient failures only.
     * Successes and permanent failures (4xx, robots block, rejected content
     * type) are never retried.
     */
    public function isRetryable(): bool
    {
        $transient = [self::Failed, self::Timeout, self::Blocked5xx];

        return in_array($this, $transient, true);
    }

    /**
     * Whether outbound links found during the fetch should be registered.
     * Only a Success carries usable links.
     */
    public function shouldRegisterOutboundLinks(): bool
    {
        return match ($this) {
            self::Success => true,
            default => false,
        };
    }
}

View file

@ -0,0 +1,20 @@
<?php
declare(strict_types=1);
namespace App\Enums;
/**
 * Status values stored in a page row's `status` column.
 */
enum PageStatusEnum: string
{
    // Initial status given to a page when it is first registered.
    case Discovered = 'discovered';
    // A crawl of the page succeeded.
    case Fetched = 'fetched';
    // A crawl of the page failed (error, timeout, robots block, 4xx or 5xx).
    case Failed = 'failed';
    /**
     * The crawler fetched the page but rejected it as unindexable in v0.1
     * (non-HTML Content-Type). The page row stays as a sentinel preventing
     * re-discovery loops; a future re-crawl could flip the status back to
     * Discovered -> Fetched if the URL starts serving HTML.
     */
    case Rejected = 'rejected';
}

View file

@ -0,0 +1,23 @@
<?php
declare(strict_types=1);
namespace App\Http\Controllers\Admin;
use App\Enums\PageStatusEnum;
use App\Http\Controllers\Controller;
use Illuminate\View\View;
use Lvl0\FediDiscover\Models\Instance;
/**
 * Admin listing of instances with their page counts.
 */
class InstancesController extends Controller
{
    /**
     * Render every instance, ordered by URL, with the total number of pages
     * and the number of pages currently in the Failed status.
     */
    public function index(): View
    {
        $failedOnly = fn ($q) => $q->where('status', PageStatusEnum::Failed);

        $counts = [
            'pages',
            'pages as failed_pages_count' => $failedOnly,
        ];

        $instances = Instance::withCount($counts)
            ->orderBy('url', 'asc')
            ->get();

        return view('admin.index', compact('instances'));
    }
}

View file

@ -0,0 +1,127 @@
<?php
declare(strict_types=1);
namespace App\Jobs;
use App\Actions\FetchPageAction;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use App\Models\PageCrawl;
use App\Services\PolitenessService;
use App\Services\RobotsService;
use App\ValueObjects\FetchResult;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Queue\Queueable;
use Illuminate\Support\Facades\Cache;
/**
 * Queue job that performs one crawl attempt for a PageCrawl row: robots
 * check, per-domain politeness lock, fetch, bookkeeping on the crawl and
 * page rows, outbound-link registration and retry scheduling.
 */
class ProcessCrawlJob implements ShouldQueue
{
    use Queueable;

    public function __construct(
        public PageCrawl $pageCrawl,
    ) {}

    /**
     * Run the crawl attempt.
     */
    public function handle(): void
    {
        $robotsService = resolve(RobotsService::class);

        if (! $robotsService->isAllowed($this->pageCrawl->page->url)) {
            $this->pageCrawl->update([
                'outcome' => CrawlOutcomeEnum::BlockedRobots,
                'completed_at' => now(),
            ]);

            // Same columns updatePageStatus() writes for a Failed page, so a
            // robots-blocked page also records when it failed.
            $this->pageCrawl->page->update([
                'status' => PageStatusEnum::Failed,
                'failed_at' => now(),
            ]);

            return;
        }

        $fetcher = resolve(FetchPageAction::class);
        $register = resolve(RegisterDiscoveredPageAction::class);
        $politenessService = resolve(PolitenessService::class);

        // One lock per domain, held for the politeness delay. It is never
        // released explicitly; it expires after $delay seconds. If another
        // job holds it, this job goes back on the queue for the same delay.
        $delay = $politenessService->minDelayFor($this->pageCrawl->domain);
        $lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay);

        if (! $lock->get()) {
            $this->release($delay);

            return;
        }

        $result = $fetcher($this->pageCrawl->page->url);

        $this->writeOutcome($result);
        $this->updatePageStatus($result);

        if ($result->outcome->shouldRegisterOutboundLinks()) {
            $result->outboundLinks->each(fn (string $url) => $register($url));
        }

        if ($result->outcome->isRetryable()) {
            $this->scheduleRetryIfNeeded();
        }
    }

    /**
     * Persist the result of this attempt on the crawl row.
     */
    private function writeOutcome(FetchResult $result): void
    {
        $this->pageCrawl->update([
            'outcome' => $result->outcome,
            'completed_at' => now(),
            'status_code' => $result->statusCode,
            'error_message' => $result->errorMessage,
        ]);
    }

    /**
     * Move the parent page to the status implied by the outcome and write the
     * columns that belong to that status.
     */
    private function updatePageStatus(FetchResult $result): void
    {
        $status = $result->outcome->toPageStatus();

        $update = match ($status) {
            PageStatusEnum::Fetched => [
                'status' => $status,
                'fetched_at' => now(),
                'title' => $result->title,
                // Sticky language: only write when detection produced a value, so a re-crawl
                // returning null doesn't erase a previously-detected language. Guarding on
                // language alone is sufficient because FetchPageAction::detectLanguage()
                // always returns the pair as both-null or both-non-null (never mixed).
                ...($result->language !== null ? [
                    'language' => $result->language,
                    'language_confidence' => $result->languageConfidence,
                ] : []),
            ],
            PageStatusEnum::Failed => [
                'status' => $status,
                'failed_at' => now(),
            ],
            PageStatusEnum::Rejected => [
                'status' => $status,
            ],
            // No outcome maps to Discovered; the arm keeps the match exhaustive.
            PageStatusEnum::Discovered => [
                'status' => $status,
            ],
        };

        $this->pageCrawl->page->update($update);
    }

    /**
     * Queue another attempt in one hour unless the page already has three
     * crawl rows.
     */
    private function scheduleRetryIfNeeded(): void
    {
        if (PageCrawl::where('page_id', $this->pageCrawl->page_id)->count() >= 3) {
            return;
        }

        // Copy only the identifying attributes. Copying the whole row would
        // carry the finished attempt's completed_at, status_code and
        // error_message into a row that has not been crawled yet.
        // withoutEvents() stops the model observer from dispatching the job
        // immediately; it is dispatched below with a delay instead.
        $newRow = PageCrawl::withoutEvents(
            fn () => PageCrawl::create([
                'page_id' => $this->pageCrawl->page_id,
                'domain' => $this->pageCrawl->domain,
                'priority' => $this->pageCrawl->priority,
                'outcome' => null,
            ])
        );

        ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
    }
}

View file

@ -0,0 +1,18 @@
<?php
declare(strict_types=1);
namespace App\Listeners;
use App\Services\PollAlertService;
use Lvl0\FediDiscover\Events\PollFailed;
/**
 * Forwards PollFailed events to the poll alert service.
 */
class PollFailedListener
{
    public function __construct(private PollAlertService $service) {}

    /**
     * Record the failure for the instance named in the event.
     */
    public function handle(PollFailed $event): void
    {
        $failedInstance = $event->instance;
        $reason = $event->message;

        $this->service->recordFailure($failedInstance, $reason);
    }
}

View file

@ -0,0 +1,36 @@
<?php
declare(strict_types=1);
namespace App\Listeners;
use App\Actions\RegisterDiscoveredPageAction;
use App\Models\PageLink;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Support\Facades\DB;
use Lvl0\FediDiscover\Events\UrlDiscovered;
/**
 * Queued listener that registers a discovered URL as a page and, when the
 * event names a distinct post URL, links the post page to it.
 */
class UrlDiscoveredListener implements ShouldQueue
{
    public function __construct(
        private RegisterDiscoveredPageAction $registerPage,
    ) {}

    /**
     * Register the target page and, if applicable, the source page and the
     * link between them, inside one database transaction.
     */
    public function handle(UrlDiscovered $event): void
    {
        $register = $this->registerPage;

        DB::transaction(function () use ($event, $register): void {
            $target = $register($event->url, $event->instanceId);

            // A link is only recorded when the post has its own, different URL.
            $hasDistinctPost = $event->postUrl !== null && $event->postUrl !== $event->url;

            if ($hasDistinctPost) {
                $source = $register($event->postUrl, $event->instanceId);

                PageLink::firstOrCreate([
                    'source_page_id' => $source->id,
                    'target_page_id' => $target->id,
                ]);
            }
        });
    }
}

View file

@ -0,0 +1,44 @@
<?php
declare(strict_types=1);
namespace App\Livewire;
use App\Actions\RegisterDiscoveredPageAction;
use Illuminate\Contracts\View\View;
use Illuminate\Support\Facades\RateLimiter;
use Livewire\Component;
/**
 * Livewire form for submitting a URL to be registered as a page.
 */
class UrlSubmissionForm extends Component
{
    /** Current value of the URL input. */
    public string $url = '';

    /** The most recently accepted URL, or null if none has been accepted. */
    public ?string $confirmedUrl = null;

    /**
     * Rate-limit by client IP (10 attempts, 60-second decay), validate the
     * URL, register it, then clear the input.
     */
    public function submit(RegisterDiscoveredPageAction $registerPage): void
    {
        $throttleKey = 'submit-url:' . request()->ip();

        if (! RateLimiter::tooManyAttempts($throttleKey, 10)) {
            // The attempt is counted before validation runs.
            RateLimiter::hit($throttleKey, 60);

            $rules = ['url' => ['required', 'url:http,https']];
            $accepted = $this->validate($rules)['url'];

            $registerPage($accepted);

            $this->confirmedUrl = $accepted;
            $this->reset('url');

            return;
        }

        $this->addError('rate_limit', 'Too many submissions, try again shortly.');
    }

    public function render(): View
    {
        $template = 'livewire.url-submission-form';

        return view($template);
    }
}

68
app/Models/Page.php Normal file
View file

@ -0,0 +1,68 @@
<?php
declare(strict_types=1);
namespace App\Models;
use App\Enums\PageStatusEnum;
use App\Observers\PageObserver;
use Database\Factories\PageFactory;
use Illuminate\Database\Eloquent\Attributes\ObservedBy;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
use Illuminate\Database\Eloquent\Relations\HasMany;
use Illuminate\Database\Eloquent\Relations\HasOne;
use Lvl0\FediDiscover\Models\Instance;
/**
 * Eloquent model for a page row, observed by PageObserver.
 */
#[ObservedBy([PageObserver::class])]
class Page extends Model
{
    /** @use HasFactory<PageFactory> */
    use HasFactory;

    /** Mass-assignable attributes. */
    protected $fillable = [
        'url', 'status',
        'language', 'language_confidence',
        'title', 'instance_id',
        'posted_at', 'fetched_at', 'failed_at',
    ];

    /** Attribute casts. */
    protected $casts = [
        'status' => PageStatusEnum::class,
        'language_confidence' => 'float',
        'posted_at' => 'datetime',
        'fetched_at' => 'datetime',
        'failed_at' => 'datetime',
    ];

    /** Instance this page belongs to. */
    public function instance(): BelongsTo
    {
        $owner = $this->belongsTo(Instance::class);

        return $owner;
    }

    /** Links whose source is this page. */
    public function outgoingLinks(): HasMany
    {
        $links = $this->hasMany(PageLink::class, 'source_page_id');

        return $links;
    }

    /** Links whose target is this page. */
    public function incomingLinks(): HasMany
    {
        $links = $this->hasMany(PageLink::class, 'target_page_id');

        return $links;
    }

    /** All crawl rows for this page. */
    public function crawls(): HasMany
    {
        $attempts = $this->hasMany(PageCrawl::class);

        return $attempts;
    }

    /** The crawl row with the latest created_at. */
    public function latestCrawl(): HasOne
    {
        $attempt = $this->hasOne(PageCrawl::class);

        return $attempt->latestOfMany('created_at');
    }
}

45
app/Models/PageCrawl.php Normal file
View file

@ -0,0 +1,45 @@
<?php
declare(strict_types=1);
namespace App\Models;
use App\Enums\CrawlOutcomeEnum;
use App\Observers\PageCrawlObserver;
use Database\Factories\PageCrawlFactory;
use Illuminate\Database\Eloquent\Attributes\ObservedBy;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
/**
 * Eloquent model for one crawl attempt of a page, observed by PageCrawlObserver.
 */
#[ObservedBy(PageCrawlObserver::class)]
class PageCrawl extends Model
{
    /** @use HasFactory<PageCrawlFactory> */
    use HasFactory;

    /** Mass-assignable attributes. */
    protected $fillable = [
        'page_id', 'domain', 'priority',
        'completed_at', 'outcome',
        'status_code', 'error_message',
    ];

    /** Attribute casts. */
    protected $casts = [
        'priority' => 'integer',
        'completed_at' => 'datetime',
        'outcome' => CrawlOutcomeEnum::class,
        'status_code' => 'integer',
    ];

    /**
     * Page this crawl attempt belongs to.
     *
     * @return BelongsTo<Page, $this>
     */
    public function page(): BelongsTo
    {
        $parent = $this->belongsTo(Page::class);

        return $parent;
    }
}

31
app/Models/PageLink.php Normal file
View file

@ -0,0 +1,31 @@
<?php
declare(strict_types=1);
namespace App\Models;
use Database\Factories\PageLinkFactory;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
/**
 * Eloquent model for a directed link from one page to another.
 */
class PageLink extends Model
{
    /** @use HasFactory<PageLinkFactory> */
    use HasFactory;

    /** Mass-assignable attributes. */
    protected $fillable = ['source_page_id', 'target_page_id'];

    /** Page the link originates from. */
    public function sourcePage(): BelongsTo
    {
        $origin = $this->belongsTo(Page::class, 'source_page_id');

        return $origin;
    }

    /** Page the link points to. */
    public function targetPage(): BelongsTo
    {
        $destination = $this->belongsTo(Page::class, 'target_page_id');

        return $destination;
    }
}

View file

@ -0,0 +1,14 @@
<?php
namespace App\Observers;
use App\Jobs\ProcessCrawlJob;
use App\Models\PageCrawl;
/**
 * Dispatches a crawl job for every newly created PageCrawl row.
 */
class PageCrawlObserver
{
    /**
     * Handle the model's "created" event.
     */
    public function created(PageCrawl $pageCrawl): void
    {
        $job = new ProcessCrawlJob($pageCrawl);

        dispatch($job);
    }
}

View file

@ -0,0 +1,25 @@
<?php
declare(strict_types=1);
namespace App\Observers;
use App\Models\Page;
use App\Models\PageCrawl;
use App\Services\UrlService;
/**
 * Ensures every newly created page has a crawl row.
 */
class PageObserver
{
    public function __construct(private UrlService $urlService) {}

    /**
     * Handle the model's "created" event: create the page's crawl row, with
     * the page's host as domain and priority 0, unless one already exists.
     */
    public function created(Page $page): void
    {
        $domain = $this->urlService->host($page->url);

        $match = ['page_id' => $page->id];
        $defaults = ['domain' => $domain, 'priority' => 0];

        PageCrawl::firstOrCreate($match, $defaults);
    }
}
}

View file

@ -2,23 +2,24 @@
namespace App\Providers;
use App\Listeners\PollFailedListener;
use App\Listeners\UrlDiscoveredListener;
use App\Services\LanguageDetectionService;
use Illuminate\Support\Facades\Event;
use Illuminate\Support\ServiceProvider;
use Lvl0\FediDiscover\Events\PollFailed;
use Lvl0\FediDiscover\Events\UrlDiscovered;
class AppServiceProvider extends ServiceProvider
{
    /**
     * Register any application services.
     *
     * LanguageDetectionService is bound as a singleton.
     */
    public function register(): void
    {
        $this->app->singleton(LanguageDetectionService::class);
    }

    /**
     * Bootstrap any application services.
     *
     * Wires the fedi-discover events to their application listeners.
     */
    public function boot(): void
    {
        $listenerByEvent = [
            UrlDiscovered::class => UrlDiscoveredListener::class,
            PollFailed::class => PollFailedListener::class,
        ];

        foreach ($listenerByEvent as $event => $listener) {
            Event::listen($event, $listener);
        }
    }
}

View file

@ -0,0 +1,39 @@
<?php
declare(strict_types=1);
namespace App\Services;
use LanguageDetection\Language;
/**
 * Thin wrapper around the language-detection library that returns only the
 * top-ranked language and its score.
 */
class LanguageDetectionService
{
    private Language $language;

    public function __construct()
    {
        $this->language = new Language;
    }

    /**
     * Detect the most likely language of $text.
     *
     * @return array{0: string, 1: float}|null Language code and score, or
     *                                         null for blank text or when the
     *                                         library yields no candidates.
     */
    public function detect(string $text): ?array
    {
        $isBlank = trim($text) === '';
        if ($isBlank) {
            return null;
        }

        // bestResults() keeps every candidate within 0.025 of the top score;
        // the library returns them sorted, so the first entry is the winner.
        $candidates = $this->language->detect($text)->bestResults()->close();

        foreach ($candidates as $code => $score) {
            return [$code, $score];
        }

        return null;
    }
}

View file

@ -0,0 +1,19 @@
<?php
declare(strict_types=1);
namespace App\Services;
/**
 * Decides how long to wait between requests to the same domain.
 */
class PolitenessService
{
    /**
     * Minimum delay, in seconds, between two requests to $domain: the larger
     * of the robots.txt crawl delay for the configured user agent and the
     * configured floor (`crawler.min_domain_delay_seconds`, default 10).
     *
     * @param  string  $domain  Host name the delay is computed for.
     */
    public function minDelayFor(string $domain): int
    {
        /** @var RobotsService $robotsService */
        $robotsService = resolve(RobotsService::class);

        $crawlDelay = $robotsService->crawlDelayFor($domain, config('crawler.user_agent'));

        // Config values that come from the environment can be numeric
        // strings. max() would return such a string unchanged and the int
        // return type would then fail under strict_types, so normalise first.
        $configValue = (int) config('crawler.min_domain_delay_seconds', 10);

        return max($crawlDelay ?? 0, $configValue);
    }
}

View file

@ -0,0 +1,38 @@
<?php
declare(strict_types=1);
namespace App\Services;
use Exception;
use Illuminate\Support\Facades\Http;
use Lvl0\FediDiscover\Models\Instance;
/**
 * Tracks consecutive poll failures per instance and sends an ntfy alert once
 * a configured threshold is reached.
 */
class PollAlertService
{
    /**
     * Count one more failed poll for $instance and, if ntfy is configured and
     * the failure count has reached the threshold, POST an alert.
     *
     * Alerting is best-effort: a failure to deliver is logged as a warning
     * and never thrown.
     *
     * @param  Instance  $instance  Instance whose poll failed.
     * @param  string    $message   Failure description included in the alert body.
     */
    public function recordFailure(Instance $instance, string $message): void
    {
        $instance->increment('consecutive_poll_failures');
        $instance->refresh();

        // Environment-backed config arrives as strings, and a variable that
        // is present but empty (`NTFY_URL=`) yields '' rather than null.
        // Normalise so that blank values and non-positive thresholds all
        // mean "alerting disabled".
        $ntfyUrl = trim((string) config('services.ntfy.url'));
        $ntfyTopic = trim((string) config('services.ntfy.topic'));
        $ntfyThreshold = (int) config('services.ntfy.threshold');

        if ($ntfyUrl === '' || $ntfyTopic === '' || $ntfyThreshold <= 0) {
            return;
        }

        if ($instance->consecutive_poll_failures < $ntfyThreshold) {
            return;
        }

        try {
            Http::timeout(5)
                ->withBody($instance->url . ' - ' . $message, 'text/plain')
                ->post(rtrim($ntfyUrl, '/') . '/' . $ntfyTopic);
        } catch (Exception $e) {
            logger()->warning('ntfy alert failed', ['instance' => $instance->url, 'error' => $e->getMessage()]);
        }
    }
}

View file

@ -0,0 +1,60 @@
<?php
declare(strict_types=1);
namespace App\Services;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Http;
use Spatie\Robots\RobotsTxt;
/**
 * Answers robots.txt questions (is a URL allowed, what crawl delay applies)
 * from a per-host cached copy of the file.
 */
class RobotsService
{
    public function __construct(
        private UrlService $urlService,
    ) {}

    /**
     * Whether robots.txt for the URL's host allows its path.
     *
     * @param  string   $url        Absolute URL to check.
     * @param  ?string  $userAgent  User agent to match rules against; passed through to the parser as given.
     *
     * @throws \InvalidArgumentException When UrlService::host() rejects the URL.
     */
    public function isAllowed(string $url, ?string $userAgent = null): bool
    {
        $host = $this->urlService->host($url);

        // parse_url() returns null when there is no path and false for a
        // seriously malformed URL; both are checked as the root path.
        $path = parse_url($url, PHP_URL_PATH);
        if (! is_string($path) || $path === '') {
            $path = '/';
        }

        return (new RobotsTxt($this->robotsTxtFor($host)))->allows($path, $userAgent);
    }

    /**
     * Crawl delay robots.txt declares for $userAgent on $host, truncated to
     * whole seconds, or null when none is declared.
     */
    public function crawlDelayFor(string $host, string $userAgent): ?int
    {
        $delay = (new RobotsTxt($this->robotsTxtFor($host)))->crawlDelay($userAgent);

        return $delay !== null ? (int) $delay : null;
    }

    /**
     * Body of https://{host}/robots.txt, cached per host for
     * `crawler.robots_cache_ttl_seconds`. A non-successful response or a
     * connection failure is cached as an empty body.
     */
    private function robotsTxtFor(string $host): string
    {
        return Cache::remember(
            "crawler:robots:{$host}",
            config('crawler.robots_cache_ttl_seconds'),
            function () use ($host) {
                try {
                    $response = Http::get("https://{$host}/robots.txt");

                    return $response->successful() ? $response->body() : '';
                } catch (ConnectionException) {
                    return '';
                }
            }
        );
    }
}

View file

@ -0,0 +1,40 @@
<?php
declare(strict_types=1);
namespace App\Services;
use Illuminate\Support\Uri;
use InvalidArgumentException;
/**
 * URL validation helper for the crawler.
 */
class UrlService
{
    /**
     * Return the lower-cased host of $url after checking that the URL is
     * acceptable: http or https scheme, no embedded credentials, a non-empty
     * host, and a host that is not an IP literal.
     *
     * @throws InvalidArgumentException When any of those checks fails.
     */
    public function host(string $url): string
    {
        $parsed = Uri::of($url);

        $scheme = $parsed->scheme();
        if (in_array($scheme, [null, ''], true)) {
            throw new InvalidArgumentException("URL has no scheme: {$url}");
        }
        if ($scheme !== 'http' && $scheme !== 'https') {
            throw new InvalidArgumentException("Invalid URL scheme: {$scheme}");
        }

        if (null !== $parsed->user()) {
            throw new InvalidArgumentException("URLs with embedded credentials not allowed: {$url}");
        }

        $host = $parsed->host();
        if (in_array($host, [null, ''], true)) {
            throw new InvalidArgumentException("URL has no host: {$url}");
        }

        // Strip surrounding brackets and anything from the first '%' onward
        // before testing whether the host is an IP literal.
        $unbracketed = trim($host, '[]');
        $bareHost = preg_replace('/%.*$/', '', $unbracketed);
        $isIpLiteral = filter_var($bareHost, FILTER_VALIDATE_IP) !== false;
        if ($isIpLiteral) {
            throw new InvalidArgumentException("IP literal hosts not allowed: {$host}");
        }

        return mb_strtolower($host);
    }
}

View file

@ -0,0 +1,28 @@
<?php
declare(strict_types=1);
namespace App\ValueObjects;
use App\Enums\CrawlOutcomeEnum;
use Illuminate\Support\Collection;
/**
 * Immutable description of one fetch attempt.
 */
final readonly class FetchResult
{
    public CrawlOutcomeEnum $outcome;

    public ?int $statusCode;

    /**
     * Set to the request URL in v0.1; true post-redirect URL tracking is
     * deferred (see ticket #12 spec). Downstream consumers MUST NOT trust
     * this field as the post-redirect location until that lands.
     */
    public ?string $finalUrl;

    public ?string $title;

    public ?string $extractedText;

    /** @var Collection<int, string> */
    public Collection $outboundLinks;

    public ?int $wordCount;

    public ?string $errorMessage;

    public ?string $language;

    public ?float $languageConfidence;

    /**
     * @param ?string $finalUrl See the property note: not yet the post-redirect URL.
     * @param Collection<int, string> $outboundLinks
     */
    public function __construct(
        CrawlOutcomeEnum $outcome,
        ?int $statusCode,
        ?string $finalUrl,
        ?string $title,
        ?string $extractedText,
        Collection $outboundLinks,
        ?int $wordCount,
        ?string $errorMessage,
        ?string $language = null,
        ?float $languageConfidence = null,
    ) {
        $this->outcome = $outcome;
        $this->statusCode = $statusCode;
        $this->finalUrl = $finalUrl;
        $this->title = $title;
        $this->extractedText = $extractedText;
        $this->outboundLinks = $outboundLinks;
        $this->wordCount = $wordCount;
        $this->errorMessage = $errorMessage;
        $this->language = $language;
        $this->languageConfidence = $languageConfidence;
    }
}

View file

@ -3,15 +3,20 @@
use Illuminate\Foundation\Application;
use Illuminate\Foundation\Configuration\Exceptions;
use Illuminate\Foundation\Configuration\Middleware;
use Illuminate\Http\Request;
return Application::configure(basePath: dirname(__DIR__))
->withRouting(
web: __DIR__.'/../routes/web.php',
commands: __DIR__.'/../routes/console.php',
web: __DIR__ . '/../routes/web.php',
commands: __DIR__ . '/../routes/console.php',
health: '/up',
)
->withMiddleware(function (Middleware $middleware): void {
//
$middleware->trustProxies(
at: '*',
headers: Request::HEADER_X_FORWARDED_FOR
| Request::HEADER_X_FORWARDED_PROTO,
);
})
->withExceptions(function (Exceptions $exceptions): void {
//

View file

@ -16,10 +16,14 @@
],
"require": {
"php": "^8.3",
"fivefilters/readability.php": "^3.3",
"laravel/framework": "^13.0",
"laravel/tinker": "^3.0",
"livewire/livewire": "^4.2",
"lvl0/fedi-discover": "@dev"
"lvl0/fedi-discover": "@dev",
"patrickschur/language-detection": "^5.3",
"spatie/robots-txt": "^2.5",
"symfony/dom-crawler": "^7.4"
},
"require-dev": {
"fakerphp/faker": "^1.23",

387
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "e46e58784ec34415557c78db6bb6c97e",
"content-hash": "4d6e239c94fea8e9511f1e73f05db1df",
"packages": [
{
"name": "brick/math",
@ -508,6 +508,71 @@
],
"time": "2025-03-06T22:45:56+00:00"
},
{
"name": "fivefilters/readability.php",
"version": "v3.3.3",
"source": {
"type": "git",
"url": "https://github.com/fivefilters/readability.php.git",
"reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/fivefilters/readability.php/zipball/e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8",
"reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-mbstring": "*",
"ext-xml": "*",
"league/uri": "^7.0",
"masterminds/html5": "^2.0",
"php": ">=8.1",
"psr/log": "^1.0 || ^2.0 || ^3.0"
},
"require-dev": {
"monolog/monolog": "^3.0",
"phpunit/phpunit": "^10.0 || ^11.0"
},
"suggest": {
"monolog/monolog": "Allow logging debug information"
},
"type": "library",
"autoload": {
"psr-4": {
"fivefilters\\Readability\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"Apache-2.0"
],
"authors": [
{
"name": "Andres Rey",
"email": "andreskrey@gmail.com",
"role": "Original Developer"
},
{
"name": "Keyvan Minoukadeh",
"email": "keyvan@fivefilters.org",
"homepage": "https://www.fivefilters.org",
"role": "Developer/Maintainer"
}
],
"description": "A PHP port of Readability.js",
"homepage": "https://github.com/fivefilters/readability.php",
"keywords": [
"html",
"readability"
],
"support": {
"issues": "https://github.com/fivefilters/readability.php/issues",
"source": "https://github.com/fivefilters/readability.php/tree/v3.3.3"
},
"time": "2025-04-26T23:45:37+00:00"
},
{
"name": "fruitcake/php-cors",
"version": "v1.4.0",
@ -2102,7 +2167,7 @@
},
{
"name": "lvl0/fedi-discover",
"version": "dev-main",
"version": "dev-release/0.1.0",
"dist": {
"type": "path",
"url": "packages/Lvl0/FediDiscover",
@ -2142,6 +2207,73 @@
"relative": true
}
},
{
"name": "masterminds/html5",
"version": "2.10.0",
"source": {
"type": "git",
"url": "https://github.com/Masterminds/html5-php.git",
"reference": "fcf91eb64359852f00d921887b219479b4f21251"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/Masterminds/html5-php/zipball/fcf91eb64359852f00d921887b219479b4f21251",
"reference": "fcf91eb64359852f00d921887b219479b4f21251",
"shasum": ""
},
"require": {
"ext-dom": "*",
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7 || ^8 || ^9"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "2.7-dev"
}
},
"autoload": {
"psr-4": {
"Masterminds\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Matt Butcher",
"email": "technosophos@gmail.com"
},
{
"name": "Matt Farina",
"email": "matt@mattfarina.com"
},
{
"name": "Asmir Mustafic",
"email": "goetas@gmail.com"
}
],
"description": "An HTML5 parser and serializer.",
"homepage": "http://masterminds.github.io/html5-php",
"keywords": [
"HTML5",
"dom",
"html",
"parser",
"querypath",
"serializer",
"xml"
],
"support": {
"issues": "https://github.com/Masterminds/html5-php/issues",
"source": "https://github.com/Masterminds/html5-php/tree/2.10.0"
},
"time": "2025-07-25T09:04:22+00:00"
},
{
"name": "monolog/monolog",
"version": "3.10.0",
@ -2653,6 +2785,57 @@
],
"time": "2026-02-16T23:10:27+00:00"
},
{
"name": "patrickschur/language-detection",
"version": "v5.3.1",
"source": {
"type": "git",
"url": "https://github.com/patrickschur/language-detection.git",
"reference": "df8d32021b2ef9fde52e6fcccb83e3806822c9c6"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/patrickschur/language-detection/zipball/df8d32021b2ef9fde52e6fcccb83e3806822c9c6",
"reference": "df8d32021b2ef9fde52e6fcccb83e3806822c9c6",
"shasum": ""
},
"require": {
"ext-json": "*",
"ext-mbstring": "*",
"php": "^7.4 || ^8.0"
},
"require-dev": {
"phpunit/phpunit": "^9.5.0"
},
"type": "library",
"autoload": {
"psr-4": {
"LanguageDetection\\": "src/LanguageDetection"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Patrick Schur",
"email": "patrick_schur@outlook.de"
}
],
"description": "A language detection library for PHP. Detects the language from a given text string.",
"homepage": "https://github.com/patrickschur/language-detection",
"keywords": [
"detect",
"detection",
"language"
],
"support": {
"issues": "https://github.com/patrickschur/language-detection/issues",
"source": "https://github.com/patrickschur/language-detection/tree/v5.3.1"
},
"time": "2025-03-25T22:47:08+00:00"
},
{
"name": "phpoption/phpoption",
"version": "1.9.5",
@ -3417,6 +3600,66 @@
},
"time": "2025-12-14T04:43:48+00:00"
},
{
"name": "spatie/robots-txt",
"version": "2.5.4",
"source": {
"type": "git",
"url": "https://github.com/spatie/robots-txt.git",
"reference": "a8dd35d0a94e863f52509a366a634978e9c1db03"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/spatie/robots-txt/zipball/a8dd35d0a94e863f52509a366a634978e9c1db03",
"reference": "a8dd35d0a94e863f52509a366a634978e9c1db03",
"shasum": ""
},
"require": {
"php": "^8.1"
},
"require-dev": {
"phpunit/phpunit": "^11.5.2"
},
"type": "library",
"autoload": {
"psr-4": {
"Spatie\\Robots\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Brent Roose",
"email": "brent@spatie.be",
"homepage": "https://spatie.be",
"role": "Developer"
}
],
"description": "Determine if a page may be crawled from robots.txt and robots meta tags",
"homepage": "https://github.com/spatie/robots-txt",
"keywords": [
"robots-txt",
"spatie"
],
"support": {
"issues": "https://github.com/spatie/robots-txt/issues",
"source": "https://github.com/spatie/robots-txt/tree/2.5.4"
},
"funding": [
{
"url": "https://spatie.be/open-source/support-us",
"type": "custom"
},
{
"url": "https://github.com/spatie",
"type": "github"
}
],
"time": "2026-02-25T07:59:20+00:00"
},
{
"name": "symfony/clock",
"version": "v7.4.8",
@ -3729,6 +3972,78 @@
],
"time": "2024-09-25T14:21:43+00:00"
},
{
"name": "symfony/dom-crawler",
"version": "v7.4.8",
"source": {
"type": "git",
"url": "https://github.com/symfony/dom-crawler.git",
"reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/dom-crawler/zipball/2918e7c2ba964defca1f5b69c6f74886529e2dc8",
"reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8",
"shasum": ""
},
"require": {
"masterminds/html5": "^2.6",
"php": ">=8.2",
"symfony/deprecation-contracts": "^2.5|^3",
"symfony/polyfill-ctype": "~1.8",
"symfony/polyfill-mbstring": "~1.0"
},
"require-dev": {
"symfony/css-selector": "^6.4|^7.0|^8.0"
},
"type": "library",
"autoload": {
"psr-4": {
"Symfony\\Component\\DomCrawler\\": ""
},
"exclude-from-classmap": [
"/Tests/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Fabien Potencier",
"email": "fabien@symfony.com"
},
{
"name": "Symfony Community",
"homepage": "https://symfony.com/contributors"
}
],
"description": "Eases DOM navigation for HTML and XML documents",
"homepage": "https://symfony.com",
"support": {
"source": "https://github.com/symfony/dom-crawler/tree/v7.4.8"
},
"funding": [
{
"url": "https://symfony.com/sponsor",
"type": "custom"
},
{
"url": "https://github.com/fabpot",
"type": "github"
},
{
"url": "https://github.com/nicolas-grekas",
"type": "github"
},
{
"url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
"type": "tidelift"
}
],
"time": "2026-03-24T13:12:05+00:00"
},
{
"name": "symfony/error-handler",
"version": "v7.4.8",
@ -4416,7 +4731,7 @@
},
{
"name": "symfony/polyfill-ctype",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-ctype.git",
@ -4475,7 +4790,7 @@
"portable"
],
"support": {
"source": "https://github.com/symfony/polyfill-ctype/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-ctype/tree/v1.37.0"
},
"funding": [
{
@ -4499,16 +4814,16 @@
},
{
"name": "symfony/polyfill-intl-grapheme",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-intl-grapheme.git",
"reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df"
"reference": "4864388bfbd3001ce88e234fab652acd91fdc57e"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/ad1b7b9092976d6c948b8a187cec9faaea9ec1df",
"reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df",
"url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/4864388bfbd3001ce88e234fab652acd91fdc57e",
"reference": "4864388bfbd3001ce88e234fab652acd91fdc57e",
"shasum": ""
},
"require": {
@ -4557,7 +4872,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.37.0"
},
"funding": [
{
@ -4577,11 +4892,11 @@
"type": "tidelift"
}
],
"time": "2026-04-10T16:19:22+00:00"
"time": "2026-04-26T13:13:48+00:00"
},
{
"name": "symfony/polyfill-intl-idn",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-intl-idn.git",
@ -4644,7 +4959,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.37.0"
},
"funding": [
{
@ -4668,7 +4983,7 @@
},
{
"name": "symfony/polyfill-intl-normalizer",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-intl-normalizer.git",
@ -4729,7 +5044,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.37.0"
},
"funding": [
{
@ -4753,7 +5068,7 @@
},
{
"name": "symfony/polyfill-mbstring",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-mbstring.git",
@ -4814,7 +5129,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-mbstring/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-mbstring/tree/v1.37.0"
},
"funding": [
{
@ -4838,7 +5153,7 @@
},
{
"name": "symfony/polyfill-php80",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-php80.git",
@ -4898,7 +5213,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-php80/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-php80/tree/v1.37.0"
},
"funding": [
{
@ -4922,7 +5237,7 @@
},
{
"name": "symfony/polyfill-php83",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-php83.git",
@ -4978,7 +5293,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-php83/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-php83/tree/v1.37.0"
},
"funding": [
{
@ -5002,7 +5317,7 @@
},
{
"name": "symfony/polyfill-php84",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-php84.git",
@ -5058,7 +5373,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-php84/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-php84/tree/v1.37.0"
},
"funding": [
{
@ -5082,16 +5397,16 @@
},
{
"name": "symfony/polyfill-php85",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-php85.git",
"reference": "2c408a6bb0313e6001a83628dc5506100474254e"
"reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/2c408a6bb0313e6001a83628dc5506100474254e",
"reference": "2c408a6bb0313e6001a83628dc5506100474254e",
"url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/fcfa4973a9917cef23f2e38774da74a2b7d115ee",
"reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee",
"shasum": ""
},
"require": {
@ -5138,7 +5453,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-php85/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-php85/tree/v1.37.0"
},
"funding": [
{
@ -5158,11 +5473,11 @@
"type": "tidelift"
}
],
"time": "2026-04-10T16:50:15+00:00"
"time": "2026-04-26T13:10:57+00:00"
},
{
"name": "symfony/polyfill-uuid",
"version": "v1.36.0",
"version": "v1.37.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-uuid.git",
@ -5221,7 +5536,7 @@
"uuid"
],
"support": {
"source": "https://github.com/symfony/polyfill-uuid/tree/v1.36.0"
"source": "https://github.com/symfony/polyfill-uuid/tree/v1.37.0"
},
"funding": [
{
@ -6059,16 +6374,16 @@
},
{
"name": "voku/portable-ascii",
"version": "2.1.0",
"version": "2.1.1",
"source": {
"type": "git",
"url": "https://github.com/voku/portable-ascii.git",
"reference": "d870a33f0f79d2b4579740b0620200221ee44aeb"
"reference": "8e1051fe39379367aecf014f41744ce7539a856f"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/voku/portable-ascii/zipball/d870a33f0f79d2b4579740b0620200221ee44aeb",
"reference": "d870a33f0f79d2b4579740b0620200221ee44aeb",
"url": "https://api.github.com/repos/voku/portable-ascii/zipball/8e1051fe39379367aecf014f41744ce7539a856f",
"reference": "8e1051fe39379367aecf014f41744ce7539a856f",
"shasum": ""
},
"require": {
@ -6105,7 +6420,7 @@
],
"support": {
"issues": "https://github.com/voku/portable-ascii/issues",
"source": "https://github.com/voku/portable-ascii/tree/2.1.0"
"source": "https://github.com/voku/portable-ascii/tree/2.1.1"
},
"funding": [
{
@ -6129,7 +6444,7 @@
"type": "tidelift"
}
],
"time": "2026-04-16T23:10:39+00:00"
"time": "2026-04-26T05:33:54+00:00"
}
],
"packages-dev": [

View file

@ -112,7 +112,7 @@
|
*/
'prefix' => env('CACHE_PREFIX', Str::slug((string) env('APP_NAME', 'laravel')).'-cache-'),
'prefix' => env('CACHE_PREFIX', Str::slug((string) env('APP_NAME', 'laravel')) . '-cache-'),
/*
|--------------------------------------------------------------------------

47
config/crawler.php Normal file
View file

@ -0,0 +1,47 @@
<?php
declare(strict_types=1);
return [

    /*
    |---------------------------------------------------------------------------
    | HTTP timeout (seconds)
    |---------------------------------------------------------------------------
    |
    | Hard cap on a single fetch. Guzzle's default is 0 (wait forever) — never
    | acceptable for a crawler. Tune up cautiously; longer timeouts amplify the
    | impact of slow targets on overall throughput.
    |
    | Cast to int: env() returns a string whenever the variable is set, so
    | without the cast the type would depend on whether .env overrides it.
    |
    */

    'timeout' => (int) env('CRAWLER_TIMEOUT', 10),

    /*
    |---------------------------------------------------------------------------
    | Maximum redirects to follow
    |---------------------------------------------------------------------------
    |
    | Guzzle default is 5. Cross-origin redirects are accepted in v0.1 — the
    | search engine treats the post-redirect URL as the canonical one for
    | indexing.
    |
    */

    'max_redirects' => (int) env('CRAWLER_MAX_REDIRECTS', 5),

    /*
    |---------------------------------------------------------------------------
    | User-Agent
    |---------------------------------------------------------------------------
    |
    | Identifies our crawler to target servers. The placeholder below is for
    | v0.1 development; ticket #10 replaces it with the production identity
    | and adds a `/bot` info page that the URL points at.
    |
    */

    'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'),

    /*
    |---------------------------------------------------------------------------
    | Minimum per-domain delay (seconds)
    |---------------------------------------------------------------------------
    |
    | Defaults to 10. NOTE(review): presumably the minimum gap between two
    | fetches against the same domain — confirm against the crawl scheduler.
    |
    */

    'min_domain_delay_seconds' => (int) env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10),

    /*
    |---------------------------------------------------------------------------
    | robots.txt cache TTL (seconds)
    |---------------------------------------------------------------------------
    |
    | Defaults to 86400 (24 hours). NOTE(review): presumably how long a fetched
    | robots.txt is reused before being re-requested — confirm against the
    | robots.txt fetcher.
    |
    */

    'robots_cache_ttl_seconds' => (int) env('CRAWLER_ROBOTS_CACHE_TTL_SECONDS', 60 * 60 * 24),

];

View file

@ -149,7 +149,7 @@
'options' => [
'cluster' => env('REDIS_CLUSTER', 'redis'),
'prefix' => env('REDIS_PREFIX', Str::slug((string) env('APP_NAME', 'laravel')).'-database-'),
'prefix' => env('REDIS_PREFIX', Str::slug((string) env('APP_NAME', 'laravel')) . '-database-'),
'persistent' => env('REDIS_PERSISTENT', false),
],

View file

@ -41,7 +41,7 @@
'public' => [
'driver' => 'local',
'root' => storage_path('app/public'),
'url' => rtrim(env('APP_URL', 'http://localhost'), '/').'/storage',
'url' => rtrim(env('APP_URL', 'http://localhost'), '/') . '/storage',
'visibility' => 'public',
'throw' => false,
'report' => false,

282
config/livewire.php Normal file
View file

@ -0,0 +1,282 @@
<?php
return [
/*
|---------------------------------------------------------------------------
| Component Locations
|---------------------------------------------------------------------------
|
| This value sets the root directories that'll be used to resolve view-based
| components like single and multi-file components. The make command will
| use the first directory in this array to add new component files to.
|
*/
'component_locations' => [
resource_path('views/components'),
resource_path('views/livewire'),
],
/*
|---------------------------------------------------------------------------
| Component Namespaces
|---------------------------------------------------------------------------
|
| This value sets default namespaces that will be used to resolve view-based
| components like single-file and multi-file components. These folders'll
| also be referenced when creating new components via the make command.
|
*/
'component_namespaces' => [
'layouts' => resource_path('views/layouts'),
'pages' => resource_path('views/pages'),
],
/*
|---------------------------------------------------------------------------
| Page Layout
|---------------------------------------------------------------------------
| The view that will be used as the layout when rendering a single component as
| an entire page via `Route::livewire('/post/create', 'pages::create-post')`.
| In this case, the content of pages::create-post will render into $slot.
|
*/
'component_layout' => 'layouts::app',
/*
|---------------------------------------------------------------------------
| Lazy Loading Placeholder
|---------------------------------------------------------------------------
| Livewire allows you to lazy load components that would otherwise slow down
| the initial page load. Every component can have a custom placeholder or
| you can define the default placeholder view for all components below.
|
*/
'component_placeholder' => null, // Example: 'placeholders::skeleton'
/*
|---------------------------------------------------------------------------
| Make Command
|---------------------------------------------------------------------------
| This value determines the default configuration for the artisan make command
| You can configure the component type (sfc, mfc, class) and whether to use
| the high-voltage (⚡) emoji as a prefix in the sfc|mfc component names.
|
*/
'make_command' => [
'type' => 'class', // Options: 'sfc', 'mfc', 'class'
'emoji' => false, // Options: true, false
'with' => [
'js' => false,
'css' => false,
'test' => false,
],
],
/*
|---------------------------------------------------------------------------
| Class Namespace
|---------------------------------------------------------------------------
|
| This value sets the root class namespace for Livewire component classes in
| your application. This value will change where component auto-discovery
| finds components. It's also referenced by the file creation commands.
|
*/
'class_namespace' => 'App\\Livewire',
/*
|---------------------------------------------------------------------------
| Class Path
|---------------------------------------------------------------------------
|
| This value is used to specify the path where Livewire component class files
| are created when running creation commands like `artisan make:livewire`.
| This path is customizable to match your projects directory structure.
|
*/
'class_path' => app_path('Livewire'),
/*
|---------------------------------------------------------------------------
| View Path
|---------------------------------------------------------------------------
|
| This value is used to specify where Livewire component Blade templates are
| stored when running file creation commands like `artisan make:livewire`.
| It is also used if you choose to omit a component's render() method.
|
*/
'view_path' => resource_path('views/livewire'),
/*
|---------------------------------------------------------------------------
| Temporary File Uploads
|---------------------------------------------------------------------------
|
| Livewire handles file uploads by storing uploads in a temporary directory
| before the file is stored permanently. All file uploads are directed to
| a global endpoint for temporary storage. You may configure this below:
|
*/
'temporary_file_upload' => [
'disk' => env('LIVEWIRE_TEMPORARY_FILE_UPLOAD_DISK'), // Example: 'local', 's3' | Default: 'default'
'rules' => null, // Example: ['file', 'mimes:png,jpg'] | Default: ['required', 'file', 'max:12288'] (12MB)
'directory' => null, // Example: 'tmp' | Default: 'livewire-tmp'
'middleware' => null, // Example: 'throttle:5,1' | Default: 'throttle:60,1'
'preview_mimes' => [ // Supported file types for temporary pre-signed file URLs...
'png', 'gif', 'bmp', 'svg', 'wav', 'mp4',
'mov', 'avi', 'wmv', 'mp3', 'm4a',
'jpg', 'jpeg', 'mpga', 'webp', 'wma',
],
'max_upload_time' => 5, // Max duration (in minutes) before an upload is invalidated...
'cleanup' => true, // Should cleanup temporary uploads older than 24 hrs...
],
/*
|---------------------------------------------------------------------------
| Render On Redirect
|---------------------------------------------------------------------------
|
| This value determines if Livewire will run a component's `render()` method
| after a redirect has been triggered using something like `redirect(...)`
| Setting this to true will render the view once more before redirecting
|
*/
'render_on_redirect' => false,
/*
|---------------------------------------------------------------------------
| Eloquent Model Binding
|---------------------------------------------------------------------------
|
| Previous versions of Livewire supported binding directly to eloquent model
| properties using wire:model by default. However, this behavior has been
| deemed too "magical" and has therefore been put under a feature flag.
|
*/
'legacy_model_binding' => false,
/*
|---------------------------------------------------------------------------
| Auto-inject Frontend Assets
|---------------------------------------------------------------------------
|
| By default, Livewire automatically injects its JavaScript and CSS into the
| <head> and <body> of pages containing Livewire components. By disabling
| this behavior, you need to use @livewireStyles and @livewireScripts.
|
*/
'inject_assets' => true,
/*
|---------------------------------------------------------------------------
| Navigate (SPA mode)
|---------------------------------------------------------------------------
|
| By adding `wire:navigate` to links in your Livewire application, Livewire
| will prevent the default link handling and instead request those pages
| via AJAX, creating an SPA-like effect. Configure this behavior here.
|
*/
'navigate' => [
'show_progress_bar' => true,
'progress_bar_color' => '#2299dd',
],
/*
|---------------------------------------------------------------------------
| HTML Morph Markers
|---------------------------------------------------------------------------
|
| Livewire intelligently "morphs" existing HTML into the newly rendered HTML
| after each update. To make this process more reliable, Livewire injects
| "markers" into the rendered Blade surrounding @if, @class & @foreach.
|
*/
'inject_morph_markers' => true,
/*
|---------------------------------------------------------------------------
| Smart Wire Keys
|---------------------------------------------------------------------------
|
| Livewire uses loops and keys used within loops to generate smart keys that
| are applied to nested components that don't have them. This makes using
| nested components more reliable by ensuring that they all have keys.
|
*/
'smart_wire_keys' => true,
/*
|---------------------------------------------------------------------------
| Pagination Theme
|---------------------------------------------------------------------------
|
| When enabling Livewire's pagination feature by using the `WithPagination`
| trait, Livewire will use Tailwind templates to render pagination views
| on the page. If you want Bootstrap CSS, you can specify: "bootstrap"
|
*/
'pagination_theme' => 'tailwind',
/*
|---------------------------------------------------------------------------
| Release Token
|---------------------------------------------------------------------------
|
| This token is stored client-side and sent along with each request to check
| a users session to see if a new release has invalidated it. If there is
| a mismatch it will throw an error and prompt for a browser refresh.
|
*/
'release_token' => 'a',
/*
|---------------------------------------------------------------------------
| CSP Safe
|---------------------------------------------------------------------------
|
| This config is used to determine if Livewire will use the CSP-safe version
| of Alpine in its bundle. This is useful for applications that are using
| strict Content Security Policy (CSP) to protect against XSS attacks.
|
*/
'csp_safe' => false,
/*
|---------------------------------------------------------------------------
| Payload Guards
|---------------------------------------------------------------------------
|
| These settings protect against malicious or oversized payloads that could
| cause denial of service. The default values should feel reasonable for
| most web applications. Each can be set to null to disable the limit.
|
*/
'payload' => [
'max_size' => 1024 * 1024, // 1MB - maximum request payload size in bytes
'max_nesting_depth' => 10, // Maximum depth of dot-notation property paths
'max_calls' => 50, // Maximum method calls per request
'max_components' => 20, // Maximum components per batch request
],
];

View file

@ -89,7 +89,7 @@
'handler_with' => [
'host' => env('PAPERTRAIL_URL'),
'port' => env('PAPERTRAIL_PORT'),
'connectionString' => 'tls://'.env('PAPERTRAIL_URL').':'.env('PAPERTRAIL_PORT'),
'connectionString' => 'tls://' . env('PAPERTRAIL_URL') . ':' . env('PAPERTRAIL_PORT'),
],
'processors' => [PsrLogMessageProcessor::class],
],

View file

@ -14,6 +14,12 @@
|
*/
'ntfy' => [
    // Empty values (`NTFY_URL=` in .env) arrive as '' and are normalised to null.
    'url' => env('NTFY_URL') ?: null,
    'topic' => env('NTFY_TOPIC') ?: null,
    // `NTFY_THRESHOLD=` (as shipped in .env.example) yields '' rather than null, and
    // (int) '' is 0. Only cast genuinely numeric input so an unset or blank variable
    // stays null instead of silently becoming a threshold of 0.
    'threshold' => is_numeric(env('NTFY_THRESHOLD')) ? (int) env('NTFY_THRESHOLD') : null,
],
'postmark' => [
'key' => env('POSTMARK_API_KEY'),
],

View file

@ -129,7 +129,7 @@
'cookie' => env(
'SESSION_COOKIE',
Str::slug((string) env('APP_NAME', 'laravel')).'-session'
Str::slug((string) env('APP_NAME', 'laravel')) . '-session'
),
/*

View file

@ -0,0 +1,53 @@
<?php
declare(strict_types=1);
namespace Database\Factories;
use App\Enums\CrawlOutcomeEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use Illuminate\Database\Eloquent\Factories\Factory;
/**
 * Factory for PageCrawl rows.
 *
 * The baseline definition describes a crawl that has not completed yet and is
 * not attached to any page; use the state helpers below to attach a page or to
 * mark the crawl as finished.
 *
 * @extends Factory<PageCrawl>
 */
class PageCrawlFactory extends Factory
{
    /**
     * Baseline attributes: no page, domain "example.com", priority 0 and every
     * result column left null.
     *
     * NOTE(review): the page_crawls migration declares page_id as a non-nullable
     * foreign key, so persisting a row likely requires chaining page() first.
     *
     * @return array<string, mixed>
     */
    public function definition(): array
    {
        $identity = [
            'page_id' => null,
            'domain' => 'example.com',
            'priority' => 0,
        ];

        $result = [
            'completed_at' => null,
            'outcome' => null,
            'status_code' => null,
            'error_message' => null,
        ];

        // Array union keeps the left-hand keys first, preserving attribute order.
        return $identity + $result;
    }

    /**
     * Attach the crawl to the given page by copying its primary key into page_id.
     */
    public function page(Page $page): static
    {
        return $this->state(function () use ($page): array {
            return ['page_id' => $page->id];
        });
    }

    /**
     * Mark the crawl as completed now with a Success outcome.
     */
    public function successful(): static
    {
        return $this->state(function (): array {
            return [
                'outcome' => CrawlOutcomeEnum::Success,
                'completed_at' => now(),
            ];
        });
    }

    /**
     * Mark the crawl as completed now with a Failed outcome and the given message.
     */
    public function failed(string $errorMessage): static
    {
        return $this->state(function () use ($errorMessage): array {
            return [
                'outcome' => CrawlOutcomeEnum::Failed,
                'completed_at' => now(),
                'error_message' => $errorMessage,
            ];
        });
    }
}

View file

@ -0,0 +1,26 @@
<?php
declare(strict_types=1);
namespace Database\Factories;
use App\Enums\PageStatusEnum;
use App\Models\Page;
use Illuminate\Database\Eloquent\Factories\Factory;
/**
 * Factory for Page rows.
 *
 * @extends Factory<Page>
 */
class PageFactory extends Factory
{
    /**
     * Baseline attributes: a generated URL and the Discovered status.
     *
     * @return array<string, mixed>
     */
    public function definition(): array
    {
        $attributes = [];

        $attributes['url'] = fake()->url();
        $attributes['status'] = PageStatusEnum::Discovered;

        return $attributes;
    }
}

View file

@ -0,0 +1,34 @@
<?php
declare(strict_types=1);
namespace Database\Factories;
use App\Models\Page;
use App\Models\PageLink;
use Illuminate\Database\Eloquent\Factories\Factory;
/**
 * Factory for PageLink rows (a directed link from one page to another).
 *
 * The baseline definition sets no attributes; both ends of the link are
 * supplied through withSource() and withTarget().
 *
 * @extends Factory<PageLink>
 */
class PageLinkFactory extends Factory
{
    /**
     * No defaults: callers provide the source and target pages explicitly.
     *
     * @return array<string, mixed>
     */
    public function definition(): array
    {
        return [];
    }

    /**
     * Use the given page as the origin of the link (source_page_id).
     */
    public function withSource(Page $page): static
    {
        return $this->state(function () use ($page): array {
            return ['source_page_id' => $page->id];
        });
    }

    /**
     * Use the given page as the destination of the link (target_page_id).
     */
    public function withTarget(Page $page): static
    {
        return $this->state(function () use ($page): array {
            return ['target_page_id' => $page->id];
        });
    }
}

View file

@ -0,0 +1,36 @@
<?php
declare(strict_types=1);
use App\Enums\PageStatusEnum;
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Create the `pages` table: one row per known URL plus its status,
     * detected language, title and lifecycle timestamps.
     */
    public function up(): void
    {
        Schema::create('pages', function (Blueprint $pages): void {
            $pages->id();

            // The URL is stored as TEXT and carries a unique index.
            $pages->text('url')->unique();

            // Indexed status string; new rows default to PageStatusEnum::Discovered.
            $pages->string('status')
                ->default(PageStatusEnum::Discovered->value)
                ->index();

            // Optional language tag (up to 35 characters) and a confidence
            // stored with three decimal places.
            $pages->string('language', 35)->nullable()->index();
            $pages->decimal('language_confidence', 4, 3)->nullable();

            // NOTE(review): a default string column is length-limited; confirm
            // titles are truncated before they are written.
            $pages->string('title')->nullable();

            // Optional link to a fedi_discover_instances row; set to NULL when
            // that row is deleted.
            $pages->foreignId('instance_id')
                ->nullable()
                ->constrained('fedi_discover_instances')
                ->nullOnDelete();

            $pages->timestampTz('posted_at')->nullable();
            $pages->timestampTz('fetched_at')->nullable();
            $pages->timestampTz('failed_at')->nullable();
            $pages->timestampsTz();
        });
    }

    /**
     * Drop the `pages` table.
     */
    public function down(): void
    {
        Schema::dropIfExists('pages');
    }
};

View file

@ -0,0 +1,27 @@
<?php
declare(strict_types=1);
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Create the `page_links` table: one row per directed link between two
     * rows of `pages`, unique per (source, target) pair.
     */
    public function up(): void
    {
        Schema::create('page_links', function (Blueprint $table) {
            $table->id();
            $table->foreignId('source_page_id')->constrained('pages');
            $table->foreignId('target_page_id')->constrained('pages');
            $table->timestampsTz();

            $table->unique(['source_page_id', 'target_page_id']);

            // The composite unique index leads with source_page_id, so it cannot
            // serve lookups by target alone. PostgreSQL does not index referencing
            // columns automatically; without this index, inbound-link queries and
            // the foreign-key check run on every delete from `pages` scan the table.
            $table->index('target_page_id');
        });
    }

    /**
     * Drop the `page_links` table (its indexes are dropped with it).
     */
    public function down(): void
    {
        Schema::dropIfExists('page_links');
    }
};

View file

@ -0,0 +1,34 @@
<?php
declare(strict_types=1);
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Create the `page_crawls` table: one row per crawl of a page, with its
     * domain, priority and (once completed) outcome details.
     */
    public function up(): void
    {
        Schema::create('page_crawls', function (Blueprint $crawls): void {
            $crawls->id();

            // Required owner page; crawl rows are removed together with the page.
            $crawls->foreignId('page_id')->constrained('pages')->cascadeOnDelete();

            $crawls->string('domain');
            $crawls->smallInteger('priority')->default(0);

            // Result columns stay NULL until they are filled in.
            $crawls->timestampTz('completed_at')->nullable();
            $crawls->string('outcome')->nullable();
            $crawls->smallInteger('status_code')->nullable();
            $crawls->text('error_message')->nullable();

            $crawls->timestampsTz();

            // Composite index on (page_id, created_at).
            $crawls->index(['page_id', 'created_at']);
        });
    }

    /**
     * Drop the `page_crawls` table.
     */
    public function down(): void
    {
        Schema::dropIfExists('page_crawls');
    }
};

128
docker/prod/Dockerfile Normal file
View file

@ -0,0 +1,128 @@
# syntax=docker/dockerfile:1

# ============================================================
# Stage 1: Build frontend assets
# ============================================================
FROM node:20-alpine AS frontend

WORKDIR /app

# Install JS dependencies before copying resources/ so the (slow) npm layer is
# only rebuilt when the manifests change, not on every template or CSS edit.
COPY package.json package-lock.json vite.config.js ./
RUN npm ci --no-audit --no-fund

COPY resources/ resources/
RUN npm run build

# ============================================================
# Stage 2: Runtime (FrankenPHP)
# ============================================================
FROM dunglas/frankenphp:1.1-php8.3-alpine AS runtime

RUN apk add --no-cache \
    git \
    postgresql-client \
    curl

RUN install-php-extensions \
    pdo_pgsql \
    redis \
    opcache \
    zip \
    gd \
    intl

COPY --from=composer:2 /usr/bin/composer /usr/bin/composer

WORKDIR /app

ENV APP_ENV=production \
    APP_DEBUG=false \
    LOG_CHANNEL=stack \
    LOG_LEVEL=warning \
    DB_CONNECTION=pgsql \
    DB_HOST=db \
    DB_PORT=5432 \
    REDIS_HOST=redis \
    REDIS_PORT=6379 \
    CACHE_STORE=redis \
    QUEUE_CONNECTION=redis \
    SESSION_DRIVER=redis \
    BROADCAST_CONNECTION=log \
    MAIL_MAILER=log

# Copy only the files composer needs before install, so the composer layer stays
# cached when application source changes. packages/ is required because composer.json
# declares it as a path repository.
COPY composer.json composer.lock ./
COPY packages/ packages/

# Install vendor packages only. The autoloader is NOT generated here: the
# application source (app/, database/, ...) has not been copied yet, so a
# classmap built at this point would contain no application classes.
# Scripts (package:discover) are skipped because they need a runtime Laravel boot,
# which fails without proper env; discovery happens at runtime via start-prod.sh.
RUN composer install --no-dev --no-interaction --prefer-dist --no-autoloader --no-scripts

COPY . .
COPY --from=frontend /app/public/build /app/public/build

# Generate the authoritative classmap now that the full source tree is present.
# With --classmap-authoritative the autoloader never falls back to PSR-4 file
# lookups, so every application class must be in the map at this point.
RUN composer dump-autoload --no-dev --no-interaction --classmap-authoritative --no-scripts

# .dockerignore excludes the runtime cache directories, so recreate them before
# handing ownership to the web user; view:cache needs the compiled-view directory.
RUN mkdir -p \
        /app/storage/framework/cache/data \
        /app/storage/framework/sessions \
        /app/storage/framework/views \
        /app/storage/logs \
        /app/bootstrap/cache \
    && chown -R www-data:www-data /app/storage /app/bootstrap/cache

RUN cat > /etc/caddy/Caddyfile <<'EOF'
{
    frankenphp
    order php_server before file_server
}

:8000 {
    root * /app/public

    php_server {
        index index.php
    }

    encode gzip zstd
    file_server

    header {
        X-Frame-Options "SAMEORIGIN"
        X-Content-Type-Options "nosniff"
        Referrer-Policy "strict-origin-when-cross-origin"
    }
}
EOF

EXPOSE 8000

HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
    CMD curl -fsS http://localhost:8000/up || exit 1

# Entrypoint: wait (up to 60s) for PostgreSQL, warm Laravel's caches, run pending
# migrations, then replace the shell with FrankenPHP.
RUN cat > /start-prod.sh <<'EOF'
#!/bin/sh
set -e

echo "Waiting for PostgreSQL at ${DB_HOST}:${DB_PORT}..."
for i in $(seq 1 60); do
    if pg_isready -h "${DB_HOST}" -p "${DB_PORT}" -q; then
        echo "PostgreSQL is ready."
        break
    fi
    if [ "$i" = "60" ]; then
        echo "Timed out waiting for PostgreSQL after 60s." >&2
        exit 1
    fi
    sleep 1
done

php artisan package:discover --ansi
php artisan config:cache
php artisan route:cache
php artisan view:cache
php artisan migrate --force

exec frankenphp run --config /etc/caddy/Caddyfile
EOF

RUN chmod +x /start-prod.sh

CMD ["/start-prod.sh"]

View file

@ -3,5 +3,20 @@
declare(strict_types=1);
return [
// HTTP client settings used when polling fediverse instances.
'http' => [
// Request timeout handed to the HTTP client by the Mastodon and Lemmy clients.
'timeout' => 10,
// Default points at the project site so fediverse admins can always trace a Trove poller
// back to the project. Operators running their own deployment should override this via
// `php artisan vendor:publish --tag=fedi-discover-config` with their own contact URL.
'user_agent' => 'Trove/1.0 (+https://trove.lvl0.xyz)',
// NOTE(review): presumably the redirect cap for outgoing requests — confirm where it is read.
'max_redirects' => 3,
],
// Fallback values for per-instance settings.
'defaults' => [
// Minimum recommended: 60. Mastodon/Lemmy rate limits apply per-instance.
'interval_seconds' => 300,
],
// Instances are DB-managed (table: fedi_discover_instances).
// See the Instance model + admin UI (TBD). No instance list here.
];

View file

@ -0,0 +1,34 @@
<?php
declare(strict_types=1);
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
/**
 * Creates the fedi_discover_instances table, one row per polled fediverse instance.
 */
return new class extends Migration
{
/**
 * Create the table.
 */
public function up(): void
{
Schema::create('fedi_discover_instances', function (Blueprint $table) {
$table->id();
// Instance software type, stored as a plain string.
$table->string('type');
// Instance origin, e.g. https://mastodon.social. Not a full endpoint path.
$table->string('url');
$table->boolean('enabled')->default(true);
$table->unsignedInteger('interval_seconds')->default(300);
// Free-form per-instance settings; defaults to an empty JSON object.
$table->json('extras')->default('{}');
$table->unsignedInteger('consecutive_poll_failures')->default(0);
// Null until the instance has been polled.
$table->timestampTz('last_polled_at')->nullable();
// Cursor of the newest post seen so far; null before any post was seen.
$table->string('last_seen_id')->nullable();
$table->timestamps();
// The same URL may be registered once per type.
$table->unique(['type', 'url']);
});
}
/**
 * Drop the table.
 */
public function down(): void
{
Schema::dropIfExists('fedi_discover_instances');
}
};

View file

@ -0,0 +1,83 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Actions;
use Carbon\CarbonImmutable;
use Illuminate\Support\Facades\Log;
use Lvl0\FediDiscover\Clients\FediverseClientFactory;
use Lvl0\FediDiscover\Events\UrlDiscovered;
use Lvl0\FediDiscover\Models\Instance;
use Lvl0\FediDiscover\ValueObjects\FediversePost;
use Throwable;
/**
 * Polls one fediverse instance and announces every external URL found in its new posts.
 */
class PollFediverseAction
{
    public function __construct(private FediverseClientFactory $factory) {}

    /**
     * Fetch posts newer than the instance's stored cursor, dispatch one UrlDiscovered
     * event per extracted link, then persist the new cursor and the poll bookkeeping.
     *
     * Exceptions thrown by the client propagate to the caller. A failure while
     * processing a single post is logged and that post counts as zero URLs, so one
     * bad post cannot abort the whole poll.
     */
    public function execute(Instance $instance): void
    {
        $start = microtime(true);
        $client = $this->factory->for($instance);
        $posts = $client->fetchPostsSince($instance, $instance->last_seen_id);

        $urlCount = $posts
            ->map(function (FediversePost $post) use ($instance): int {
                try {
                    return $this->processLinks($post, $instance);
                } catch (Throwable $e) {
                    Log::warning('fedi-discover:processLinks failed', [
                        'instance_id' => $instance->id,
                        'instance_url' => $instance->url,
                        'post_url' => $post->selfUrl,
                        'exception' => $e::class,
                        'message' => $e->getMessage(),
                    ]);

                    // Explicit zero so the sum below only ever adds integers.
                    return 0;
                }
            })
            ->sum();

        // Clients return posts newest-first, so the first item is the new high-water mark.
        if ($posts->isNotEmpty()) {
            $instance->last_seen_id = $posts->first()->cursorId;
        }

        $instance->consecutive_poll_failures = 0;
        $instance->last_polled_at = now();
        $instance->save();

        Log::info('fedi-discover:poll succeeded', [
            'instance_id' => $instance->id,
            'url_count' => $urlCount,
            'duration_ms' => (int) round((microtime(true) - $start) * 1000),
        ]);
    }

    /**
     * Extract the http(s) links from a post body and dispatch UrlDiscovered for each.
     *
     * Links are de-duplicated within the post, stripped of trailing sentence
     * punctuation, and dropped when they point at the polled instance itself.
     *
     * @return int number of events dispatched for this post
     */
    private function processLinks(FediversePost $post, Instance $instance): int
    {
        if ($post->body === null) {
            return 0;
        }

        $linksFound = preg_match_all('~https?://[^\s<>"\'()\[\]]+~', $post->body, $matches);

        // preg_match_all() returns false on a PCRE error; treat that like "no links".
        if ($linksFound === false || $linksFound === 0) {
            return 0;
        }

        // Host names are case-insensitive; parse the instance host once, not per link.
        $instanceHost = strtolower((string) parse_url($instance->url, PHP_URL_HOST));

        return collect($matches[0])
            // Bodies may be HTML, where "&" inside an href is written as "&amp;".
            ->map(fn (string $u) => str_replace('&amp;', '&', $u))
            ->map(fn (string $u) => rtrim($u, '.,;:!?'))
            ->filter(fn (string $u) => filter_var($u, FILTER_VALIDATE_URL) !== false)
            ->filter(fn (string $u) => strtolower((string) parse_url($u, PHP_URL_HOST)) !== $instanceHost)
            ->unique()
            ->each(fn (string $url) => UrlDiscovered::dispatch(
                url: $url,
                instanceId: $instance->id,
                discoveredAt: CarbonImmutable::now(),
                postUrl: $post->selfUrl,
                postBody: $post->body,
            ))
            ->count();
    }
}

View file

@ -0,0 +1,24 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Clients;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
/**
 * Chooses the API client that matches an instance's software type.
 */
class FediverseClientFactory
{
    private MastodonClient $mastodonClient;

    private LemmyClient $lemmyClient;

    public function __construct(MastodonClient $mastodonClient, LemmyClient $lemmyClient)
    {
        $this->mastodonClient = $mastodonClient;
        $this->lemmyClient = $lemmyClient;
    }

    /**
     * Return the client for the given instance's type.
     */
    public function for(Instance $instance): FediverseClientInterface
    {
        $kind = $instance->type;

        return match ($kind) {
            InstanceType::Lemmy => $this->lemmyClient,
            InstanceType::Mastodon => $this->mastodonClient,
        };
    }
}

View file

@ -0,0 +1,22 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Clients;
use Illuminate\Support\Collection;
use Lvl0\FediDiscover\Models\Instance;
use Lvl0\FediDiscover\ValueObjects\FediversePost;
/**
 * Contract for the per-software API clients used to poll a fediverse instance.
 */
interface FediverseClientInterface
{
/**
 * Fetch posts newer than the given cursor.
 *
 * MUST return posts in newest-first order. Callers treat the
 * first item as the new high-water mark.
 *
 * @param Instance $instance the instance to poll
 * @param string|null $lastSeenId cursor stored from the previous poll, or null when there is none
 * @return Collection<int, FediversePost>
 */
public function fetchPostsSince(Instance $instance, ?string $lastSeenId): Collection;
}

View file

@ -0,0 +1,43 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Clients;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Http;
use Lvl0\FediDiscover\Models\Instance;
use Lvl0\FediDiscover\ValueObjects\FediversePost;
/**
 * Reads posts from a Lemmy instance through its v3 post list endpoint.
 */
class LemmyClient implements FediverseClientInterface
{
    /**
     * Fetch posts newer than the given cursor, newest first.
     *
     * The Lemmy post list endpoint has no "newer than id" parameter, so the newest
     * posts are requested with sort=New and the cursor is applied client-side.
     * Results are re-ordered by id because featured posts can be listed ahead of
     * newer ones, and callers take the first item as the new high-water mark.
     *
     * A non-2xx response yields an empty collection.
     *
     * @return Collection<int, FediversePost>
     */
    public function fetchPostsSince(Instance $instance, ?string $lastSeenId): Collection
    {
        $url = 'https://' . parse_url($instance->url, PHP_URL_HOST) . '/api/v3/post/list';

        $response = Http::withHeaders([
            'User-Agent' => config('fedi-discover.http.user_agent'),
        ])->timeout(config('fedi-discover.http.timeout'))->get($url, ['sort' => 'New']);

        if (! $response->successful()) {
            return collect();
        }

        return collect($response->json('posts', []))
            ->map(fn (array $p) => $p['post'])
            // Keep only posts beyond the cursor; Lemmy post ids are integers.
            ->filter(fn (array $t) => $lastSeenId === null || (int) $t['id'] > (int) $lastSeenId)
            ->sortByDesc(fn (array $t) => (int) $t['id'])
            ->values()
            ->map(function (array $t) {
                // The submitted link lives in `url`; append it to the body so link
                // extraction sees it. Both keys may be absent or null.
                $parts = array_filter([$t['body'] ?? null, $t['url'] ?? null]);
                $body = $parts ? implode(' ', $parts) : null;

                return new FediversePost(
                    cursorId: (string) $t['id'],
                    selfUrl: $t['ap_id'],
                    body: $body,
                    title: $t['name'],
                    publishedAt: $t['published']
                );
            });
    }
}

View file

@ -0,0 +1,36 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Clients;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Http;
use Lvl0\FediDiscover\Models\Instance;
use Lvl0\FediDiscover\ValueObjects\FediversePost;
/**
 * Reads statuses from a Mastodon-compatible instance's public timeline.
 */
class MastodonClient implements FediverseClientInterface
{
    /**
     * Fetch public-timeline statuses newer than the given cursor.
     *
     * The cursor is sent as `min_id`; on the first poll (null cursor) no cursor
     * parameter is sent. A non-2xx response or a body that is not a JSON list of
     * status objects yields an empty collection. Order is kept as returned by the API.
     *
     * @return Collection<int, FediversePost>
     */
    public function fetchPostsSince(Instance $instance, ?string $lastSeenId): Collection
    {
        $url = 'https://' . parse_url($instance->url, PHP_URL_HOST) . '/api/v1/timelines/public';
        $params = $lastSeenId !== null ? ['min_id' => $lastSeenId] : [];

        $response = Http::withHeaders([
            'User-Agent' => config('fedi-discover.http.user_agent'),
        ])->timeout(config('fedi-discover.http.timeout'))->get($url, $params);

        if (! $response->successful()) {
            return collect();
        }

        return collect($response->json() ?? [])
            // Skip anything that is not a status object, e.g. the fields of an error payload.
            ->filter(fn ($t) => is_array($t) && isset($t['id']))
            ->values()
            ->map(fn (array $t) => new FediversePost(
                // Cast so a numeric id cannot trip the string-typed constructor.
                cursorId: (string) $t['id'],
                selfUrl: $t['url'] ?? $t['uri'] ?? null,
                body: $t['content'] ?? null,
                publishedAt: $t['created_at'] ?? null
            ));
    }
}

View file

@ -0,0 +1,65 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Config;
use InvalidArgumentException;
/**
 * Immutable, validated description of one fediverse instance to poll.
 */
final readonly class InstanceConfig
{
    /**
     * @param array<string, mixed> $extras
     */
    public function __construct(
        public InstanceType $type,
        public string $url,
        public bool $enabled,
        public int $intervalSeconds,
        public array $extras
    ) {}

    /**
     * Build a config from a plain array, validating presence, type and value of each key.
     *
     * Required keys: type, url, enabled, interval_seconds. Optional: extras (defaults to []).
     * Every validation failure, including a wrongly typed value, is reported as
     * InvalidArgumentException rather than leaking a TypeError.
     *
     * @param array<string, mixed> $array
     *
     * @throws InvalidArgumentException
     */
    public static function fromArray(array $array): self
    {
        foreach (['type', 'url', 'enabled', 'interval_seconds'] as $key) {
            if (! array_key_exists($key, $array)) {
                throw new InvalidArgumentException("Missing required key: {$key}");
            }
        }

        if (! is_int($array['interval_seconds'])) {
            throw new InvalidArgumentException(
                'Interval seconds needs to be an integer, got ' . get_debug_type($array['interval_seconds'])
            );
        }

        if ($array['interval_seconds'] <= 0) {
            throw new InvalidArgumentException('Interval seconds needs to be larger than zero');
        }

        if (! is_string($array['type'])) {
            throw new InvalidArgumentException('Invalid type: expected string, got ' . get_debug_type($array['type']));
        }

        $type = InstanceType::tryFrom($array['type']);
        if ($type === null) {
            throw new InvalidArgumentException('Invalid type: ' . $array['type']);
        }

        if (! is_string($array['url'])) {
            throw new InvalidArgumentException('Invalid URL: expected string, got ' . get_debug_type($array['url']));
        }

        if (filter_var($array['url'], FILTER_VALIDATE_URL) === false) {
            throw new InvalidArgumentException('Invalid URL: ' . $array['url']);
        }

        if (! is_bool($array['enabled'])) {
            throw new InvalidArgumentException('Enabled needs to be a boolean, got ' . get_debug_type($array['enabled']));
        }

        $extras = $array['extras'] ?? [];
        if (! is_array($extras)) {
            throw new InvalidArgumentException('Extras needs to be an array, got ' . get_debug_type($extras));
        }

        return new self(
            type: $type,
            url: $array['url'],
            enabled: $array['enabled'],
            intervalSeconds: $array['interval_seconds'],
            extras: $extras
        );
    }

    /**
     * Export using the same keys fromArray() accepts; the type is exported as its string value.
     *
     * @return array<string, mixed>
     */
    public function toArray(): array
    {
        return [
            'type' => $this->type->value,
            'url' => $this->url,
            'enabled' => $this->enabled,
            'interval_seconds' => $this->intervalSeconds,
            'extras' => $this->extras,
        ];
    }
}

View file

@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Config;
/**
 * Fediverse software types an instance can have; the backing value is the string form.
 */
enum InstanceType: string
{
case Mastodon = 'mastodon';
case Lemmy = 'lemmy';
}

View file

@ -0,0 +1,61 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Console\Commands;
use Illuminate\Console\Attributes\Description;
use Illuminate\Console\Attributes\Signature;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\Log;
use Lvl0\FediDiscover\Actions\PollFediverseAction;
use Lvl0\FediDiscover\Events\PollFailed;
use Lvl0\FediDiscover\Models\Instance;
use Throwable;
#[Signature('fedi-discover:poll')]
#[Description('Poll all enabled fediverse instances for new URLs')]
class PollInstancesCommand extends Command
{
    public function __construct(
        private readonly PollFediverseAction $action
    ) {
        parent::__construct();
    }

    /**
     * Poll every enabled instance in turn. A failure on one instance is reported
     * and logged without stopping the others; once all instances were attempted,
     * a PollFailed event is dispatched for each failure.
     *
     * @return int SUCCESS when every poll succeeded, FAILURE otherwise
     */
    public function handle(): int
    {
        $failures = [];

        foreach (Instance::enabled()->get() as $instance) {
            try {
                $this->action->execute($instance);
            } catch (Throwable $e) {
                $this->error("Failed to poll {$instance->url}: {$e->getMessage()}");
                Log::warning('fedi-discover:poll failed', [
                    'instance_id' => $instance->id,
                    'instance_url' => $instance->url,
                    'exception' => $e::class,
                    'message' => $e->getMessage(),
                ]);

                $failures[] = [$instance, $e->getMessage()];
            }
        }

        if ($failures === []) {
            return self::SUCCESS;
        }

        // Events are sent only after the full pass over the instances.
        foreach ($failures as [$failedInstance, $reason]) {
            PollFailed::dispatch($failedInstance, $reason, now()->toImmutable());
        }

        return self::FAILURE;
    }
}

View file

@ -0,0 +1,64 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Console\Commands;
use Illuminate\Console\Attributes\Description;
use Illuminate\Console\Attributes\Signature;
use Illuminate\Console\Command;
use Lvl0\FediDiscover\Models\Instance;
#[Signature('fedi-discover:validate {--enabled-only}')]
#[Description('Validate saved instances')]
class ValidateInstancesCommand extends Command
{
    /**
     * Check the URL and polling interval of each stored instance (optionally only
     * the enabled ones), print the totals, and list every invalid instance with
     * its reasons.
     *
     * @return int SUCCESS when no instance is invalid, FAILURE otherwise
     */
    public function handle(): int
    {
        $query = Instance::query();
        if ($this->option('enabled-only')) {
            $query->enabled();
        }
        $all = $query->get();

        // Each entry: [instance, comma-separated reasons].
        $problems = [];
        foreach ($all as $instance) {
            $reasons = [];

            if (filter_var($instance->url, FILTER_VALIDATE_URL) === false) {
                $reasons[] = 'Invalid URL: ' . $instance->url;
            }

            if ($instance->interval_seconds < 1) {
                $reasons[] = 'Invalid interval seconds: ' . $instance->interval_seconds;
            }

            if ($reasons !== []) {
                $problems[] = [$instance, implode(', ', $reasons)];
            }
        }

        $total = $all->count();
        $invalidCount = count($problems);

        $this->info((string) $total);
        $this->info(($total - $invalidCount) . ' valid');
        $this->line($invalidCount . ' invalid');

        if ($invalidCount === 0) {
            return self::SUCCESS;
        }

        foreach ($problems as [$instance, $summary]) {
            $this->warn($instance->id . ' - ' . $instance->url);
            $this->line(' : ' . $summary);
        }

        return self::FAILURE;
    }
}

View file

@ -0,0 +1,54 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Database\Factories;
use Illuminate\Database\Eloquent\Factories\Factory;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
/**
 * Model factory for Instance. The base definition leaves `type` and `enabled`
 * as null; set them with the type() and enabled()/disabled() states.
 *
 * @extends Factory<Instance>
 */
class InstanceFactory extends Factory
{
    protected $model = Instance::class;

    /**
     * Base attribute set.
     *
     * @return array<string, mixed>
     */
    public function definition(): array
    {
        $attributes = [];
        $attributes['type'] = null;
        $attributes['url'] = fake()->url;
        $attributes['enabled'] = null;
        $attributes['interval_seconds'] = 600;
        $attributes['extras'] = [];
        $attributes['last_seen_id'] = null;
        $attributes['last_polled_at'] = now();

        return $attributes;
    }

    /** State: use the given instance type (stored as its string value). */
    public function type(InstanceType $type): self
    {
        return $this->state(['type' => $type->value]);
    }

    /** State: mark the instance enabled. */
    public function enabled(): self
    {
        return $this->state(['enabled' => true]);
    }

    /** State: mark the instance disabled. */
    public function disabled(): self
    {
        return $this->state(['enabled' => false]);
    }
}

View file

@ -0,0 +1,23 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Events;
use Carbon\CarbonImmutable;
use Illuminate\Foundation\Events\Dispatchable;
use Illuminate\Queue\SerializesModels;
use Lvl0\FediDiscover\Models\Instance;
/**
 * Event carrying the instance whose poll failed, the failure message and the time.
 */
class PollFailed
{
    use Dispatchable, SerializesModels;

    public Instance $instance;

    public string $message;

    public CarbonImmutable $failedAt;

    public function __construct(Instance $instance, string $message, CarbonImmutable $failedAt)
    {
        $this->instance = $instance;
        $this->message = $message;
        $this->failedAt = $failedAt;
    }
}

View file

@ -0,0 +1,22 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Events;
use Carbon\CarbonImmutable;
use Illuminate\Foundation\Events\Dispatchable;
use Illuminate\Queue\SerializesModels;
/**
 * Event describing one URL found in a polled post: the URL, the id of the
 * instance it was found on, when it was found, and optionally the post's own
 * URL and body.
 */
class UrlDiscovered
{
    use Dispatchable, SerializesModels;

    public string $url;

    public int $instanceId;

    public CarbonImmutable $discoveredAt;

    public ?string $postUrl;

    public ?string $postBody;

    public function __construct(
        string $url,
        int $instanceId,
        CarbonImmutable $discoveredAt,
        ?string $postUrl = null,
        ?string $postBody = null,
    ) {
        $this->url = $url;
        $this->instanceId = $instanceId;
        $this->discoveredAt = $discoveredAt;
        $this->postUrl = $postUrl;
        $this->postBody = $postBody;
    }
}

View file

@ -5,20 +5,32 @@
namespace Lvl0\FediDiscover;
use Illuminate\Support\ServiceProvider;
use Lvl0\FediDiscover\Clients\FediverseClientFactory;
use Lvl0\FediDiscover\Console\Commands\PollInstancesCommand;
use Lvl0\FediDiscover\Console\Commands\ValidateInstancesCommand;
/**
 * Registers the package's config, client factory, migrations and console commands.
 */
class FediDiscoverServiceProvider extends ServiceProvider
{
    /**
     * Merge the package config defaults and bind the client factory as a singleton.
     */
    public function register(): void
    {
        $this->mergeConfigFrom(__DIR__ . '/../config/fedi-discover.php', 'fedi-discover');

        $this->app->singleton(FediverseClientFactory::class);
    }

    /**
     * Load the package migrations; in console context also expose the publishable
     * config (tag: fedi-discover-config) and register the artisan commands.
     */
    public function boot(): void
    {
        $this->loadMigrationsFrom(__DIR__ . '/../database/migrations');

        if ($this->app->runningInConsole()) {
            $this->publishes([
                __DIR__ . '/../config/fedi-discover.php' => config_path('fedi-discover.php'),
            ], 'fedi-discover-config');

            $this->commands([
                PollInstancesCommand::class,
                ValidateInstancesCommand::class,
            ]);
        }
    }
}

View file

@ -0,0 +1,64 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Models;
use App\Models\Page;
use Illuminate\Database\Eloquent\Builder;
use Illuminate\Database\Eloquent\Factories\Factory;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\HasMany;
use Illuminate\Support\Carbon;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Database\Factories\InstanceFactory;
/**
 * A fediverse instance that is polled for links (table: fedi_discover_instances).
 *
 * @property int $id
 * @property InstanceType $type
 * @property string $url
 * @property bool $enabled
 * @property int $interval_seconds
 * @property array<string, mixed> $extras
 * @property string|null $last_seen_id
 * @property int $consecutive_poll_failures
 * @property Carbon|null $last_polled_at
 * @property Carbon $created_at
 * @property Carbon $updated_at
 */
class Instance extends Model
{
    /** @use HasFactory<InstanceFactory> */
    use HasFactory;

    protected $table = 'fedi_discover_instances';

    protected $fillable = ['type', 'url', 'enabled', 'interval_seconds', 'extras', 'last_seen_id', 'last_polled_at', 'consecutive_poll_failures'];

    // The integer casts keep the documented `int` property types true even when
    // the database driver hands numeric columns back as strings.
    protected $casts = [
        'type' => InstanceType::class,
        'enabled' => 'boolean',
        'interval_seconds' => 'integer',
        'extras' => 'array',
        'consecutive_poll_failures' => 'integer',
        'last_polled_at' => 'datetime',
    ];

    /**
     * Limit the query to instances with `enabled` set to true.
     *
     * @param Builder<self> $query
     * @return Builder<self>
     */
    public function scopeEnabled(Builder $query): Builder
    {
        return $query->where('enabled', true);
    }

    /**
     * Point the HasFactory trait at the package's factory class.
     */
    protected static function newFactory(): Factory
    {
        return InstanceFactory::new();
    }

    /**
     * Pages related to this instance.
     */
    public function pages(): HasMany
    {
        return $this->hasMany(Page::class);
    }
}

View file

@ -0,0 +1,16 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\ValueObjects;
/**
 * One post as returned by a fediverse client: its cursor id, its own URL, and
 * the optional body, title and publication timestamp string.
 */
class FediversePost
{
    public string $cursorId;

    public ?string $selfUrl;

    public ?string $body;

    public ?string $title;

    public ?string $publishedAt;

    public function __construct(
        string $cursorId,
        ?string $selfUrl,
        ?string $body = null,
        ?string $title = null,
        ?string $publishedAt = null,
    ) {
        $this->cursorId = $cursorId;
        $this->selfUrl = $selfUrl;
        $this->body = $body;
        $this->title = $title;
        $this->publishedAt = $publishedAt;
    }
}

View file

@ -0,0 +1,45 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Lvl0\FediDiscover\Clients\FediverseClientFactory;
use Lvl0\FediDiscover\Clients\LemmyClient;
use Lvl0\FediDiscover\Clients\MastodonClient;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Covers client selection by instance type and the factory's container binding.
 */
class FediverseClientFactoryTest extends TestCase
{
    public function test_it_resolves_mastodon_client_for_mastodon_instance_type(): void
    {
        $resolved = $this->resolveClientFor(InstanceType::Mastodon, 'https://mastodon.social');

        self::assertInstanceOf(MastodonClient::class, $resolved);
    }

    public function test_it_resolves_lemmy_client_for_lemmy_instance_type(): void
    {
        $resolved = $this->resolveClientFor(InstanceType::Lemmy, 'https://lemmy.world');

        self::assertInstanceOf(LemmyClient::class, $resolved);
    }

    public function test_it_is_registered_as_a_singleton_in_the_container(): void
    {
        $first = $this->app->make(FediverseClientFactory::class);
        $second = $this->app->make(FediverseClientFactory::class);

        self::assertSame($first, $second);
    }

    /**
     * Resolve the factory from the container and ask it for a client for an
     * unsaved instance of the given type.
     */
    private function resolveClientFor(InstanceType $type, string $url): object
    {
        $factory = app(FediverseClientFactory::class);

        return $factory->for(new Instance(['type' => $type, 'url' => $url]));
    }
}

View file

@ -0,0 +1,57 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Lvl0\FediDiscover\Config\InstanceConfig;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Checks that InstanceConfig arrays can be written to and read back from the Instance model.
 */
class InstanceConfigPersistenceTest extends TestCase
{
    use RefreshDatabase;

    public function test_instance_config_to_array_is_mass_assignable_on_the_model(): void
    {
        $config = InstanceConfig::fromArray([
            'type' => InstanceType::Mastodon->value,
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => 600,
            'extras' => ['token' => 'abc123'],
        ]);

        Instance::create($config->toArray());

        // Assert what was actually stored: an attribute missing from $fillable
        // would otherwise be dropped without this test noticing.
        $stored = Instance::query()->firstOrFail();
        $this->assertSame(InstanceType::Mastodon, $stored->type);
        $this->assertSame('https://mastodon.social', $stored->url);
        $this->assertTrue($stored->enabled);
        $this->assertEquals(600, $stored->interval_seconds);
        $this->assertSame(['token' => 'abc123'], $stored->extras);

        $this->artisan('fedi-discover:validate')
            ->assertExitCode(0);
    }

    public function test_an_instance_config_survives_a_write_read_cycle_through_the_model(): void
    {
        $original = InstanceConfig::fromArray([
            'type' => InstanceType::Mastodon->value,
            'url' => 'https://hachyderm.io',
            'enabled' => false,
            'interval_seconds' => 900,
            'extras' => ['foo' => 'bar'],
        ]);

        Instance::create($original->toArray());

        $instance = Instance::query()->firstOrFail();
        $roundTripped = InstanceConfig::fromArray([
            'type' => $instance->type->value,
            'url' => $instance->url,
            'enabled' => $instance->enabled,
            'interval_seconds' => $instance->interval_seconds,
            'extras' => $instance->extras,
        ]);

        $this->assertEquals($original, $roundTripped);
    }
}

View file

@ -0,0 +1,113 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Carbon;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Persistence, casting, default and scope checks for the Instance model.
 */
class InstanceModelTest extends TestCase
{
    use RefreshDatabase;

    public function test_it_persists_and_retrieves_an_instance(): void
    {
        $this->createInstance(['extras' => ['token' => 'abc123']]);

        $found = Instance::first();

        self::assertNotNull($found);
        self::assertSame(InstanceType::Mastodon, $found->type);
        self::assertSame('https://mastodon.social', $found->url);
        self::assertTrue($found->enabled);
        self::assertSame(600, $found->interval_seconds);
        self::assertSame(['token' => 'abc123'], $found->extras);
    }

    public function test_enabled_is_fillable_and_cast_to_boolean(): void
    {
        $created = $this->createInstance(['enabled' => false]);

        self::assertFalse($created->fresh()->enabled);
    }

    public function test_last_polled_at_is_fillable_and_cast_to_datetime(): void
    {
        $polledAt = Carbon::parse('2026-04-23 12:00:00');

        $reloaded = $this->createInstance(['last_polled_at' => $polledAt])->fresh();

        self::assertInstanceOf(Carbon::class, $reloaded->last_polled_at);
        self::assertTrue($reloaded->last_polled_at->equalTo($polledAt));
    }

    public function test_last_seen_id_defaults_to_null(): void
    {
        $created = $this->createInstance();

        self::assertNull($created->fresh()->last_seen_id);
    }

    public function test_last_seen_id_is_fillable_and_persists_as_string(): void
    {
        $created = $this->createInstance(['last_seen_id' => '109876543210']);

        self::assertSame('109876543210', $created->fresh()->last_seen_id);
    }

    public function test_enabled_scope_returns_only_enabled_instances(): void
    {
        $this->createInstance(['url' => 'https://enabled.example', 'enabled' => true]);
        $this->createInstance(['url' => 'https://disabled.example', 'enabled' => false]);

        $matches = Instance::enabled()->get();

        self::assertCount(1, $matches);
        self::assertSame('https://enabled.example', $matches->first()->url);
    }

    /**
     * Create an enabled Mastodon instance for https://mastodon.social with a
     * 600 second interval, with the given attributes overriding or extending it.
     *
     * @param array<string, mixed> $overrides
     */
    private function createInstance(array $overrides = []): Instance
    {
        $base = [
            'type' => InstanceType::Mastodon,
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => 600,
        ];

        return Instance::create(array_merge($base, $overrides));
    }
}

View file

@ -0,0 +1,150 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Illuminate\Support\Facades\Http;
use Lvl0\FediDiscover\Clients\LemmyClient;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Lvl0\FediDiscover\ValueObjects\FediversePost;
use Tests\TestCase;
/**
 * Exercises LemmyClient against faked HTTP responses.
 */
class LemmyClientTest extends TestCase
{
    public function test_it_maps_each_post_to_a_fediverse_post(): void
    {
        $this->fakePostList([
            $this->lemmyPost(
                id: 42,
                apId: 'https://lemmy.world/post/42',
                name: 'My Great Post',
                body: 'Some body text',
                published: '2026-04-25T10:00:00.000000',
            ),
        ]);

        $posts = (new LemmyClient)->fetchPostsSince($this->lemmyInstance(), null);
        $mapped = $posts->first();

        self::assertCount(1, $posts);
        self::assertInstanceOf(FediversePost::class, $mapped);
        self::assertSame('42', $mapped->cursorId);
        self::assertSame('https://lemmy.world/post/42', $mapped->selfUrl);
        self::assertSame('My Great Post', $mapped->title);
        self::assertSame('Some body text', $mapped->body);
        self::assertSame('2026-04-25T10:00:00.000000', $mapped->publishedAt);
    }

    public function test_url_field_is_appended_to_body(): void
    {
        $this->fakePostList([
            $this->lemmyPost(
                id: 42,
                apId: 'https://lemmy.world/post/42',
                url: 'https://example-garden.blog/post-42',
                body: 'Some original text.',
            ),
        ]);

        $body = $this->firstFetchedPost()->body;

        self::assertStringContainsString('Some original text.', $body);
        self::assertStringContainsString('https://example-garden.blog/post-42', $body);
    }

    public function test_url_absent_leaves_body_clean(): void
    {
        $this->fakePostList([
            $this->lemmyPost(
                id: 7,
                apId: 'https://lemmy.world/post/7',
                body: 'Just a regular post.',
            ),
        ]);

        self::assertSame('Just a regular post.', $this->firstFetchedPost()->body);
    }

    public function test_it_handles_posts_without_a_body_key(): void
    {
        // 'body' key intentionally absent — real Lemmy API omits it for link-only posts
        $linkOnly = [
            'post' => [
                'id' => 99,
                'ap_id' => 'https://lemmy.world/post/99',
                'url' => null,
                'name' => 'Link-only post',
                'published' => '2026-04-25T10:00:00.000000',
            ],
        ];
        $this->fakePostList([$linkOnly]);

        self::assertNull($this->firstFetchedPost()->body);
    }

    public function test_it_hits_the_post_list_endpoint_of_the_instance(): void
    {
        Http::fake([
            'lemmy.world/api/v3/post/list*' => Http::response(['posts' => []], 200),
        ]);

        (new LemmyClient)->fetchPostsSince($this->lemmyInstance(), null);

        Http::assertSent(function ($request) {
            $hitsEndpoint = str_starts_with($request->url(), 'https://lemmy.world/api/v3/post/list');

            return $hitsEndpoint && $request->method() === 'GET';
        });
    }

    /**
     * Fake every outgoing request with a 200 response wrapping the given posts.
     *
     * @param array<int, array<string, mixed>> $posts
     */
    private function fakePostList(array $posts): void
    {
        Http::fake([
            '*' => Http::response(['posts' => $posts], 200),
        ]);
    }

    /**
     * Run a first poll (null cursor) and return the first mapped post.
     */
    private function firstFetchedPost(): ?FediversePost
    {
        return (new LemmyClient)->fetchPostsSince($this->lemmyInstance(), null)->first();
    }

    private function lemmyInstance(): Instance
    {
        return new Instance([
            'type' => InstanceType::Lemmy,
            'url' => 'https://lemmy.world',
        ]);
    }

    /**
     * Build one entry of the `posts` list in the shape the client reads.
     *
     * @return array<string, mixed>
     */
    private function lemmyPost(
        int $id,
        string $apId,
        ?string $url = null,
        string $body = '',
        string $name = 'A post title',
        string $published = '2026-04-25T10:00:00.000000',
    ): array {
        $post = [
            'id' => $id,
            'ap_id' => $apId,
            'url' => $url,
            'body' => $body,
            'name' => $name,
            'published' => $published,
        ];

        return ['post' => $post];
    }
}

View file

@ -0,0 +1,191 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Http;
use Lvl0\FediDiscover\Clients\MastodonClient;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Lvl0\FediDiscover\ValueObjects\FediversePost;
use Tests\TestCase;
/**
 * Exercises MastodonClient against faked HTTP responses.
 */
class MastodonClientTest extends TestCase
{
    public function test_it_hits_the_public_timeline_endpoint_of_the_instance(): void
    {
        Http::fake([
            'mastodon.social/api/v1/timelines/public*' => Http::response([], 200),
        ]);

        $this->fetch();

        Http::assertSent(function ($request) {
            $hitsEndpoint = str_starts_with($request->url(), 'https://mastodon.social/api/v1/timelines/public');

            return $hitsEndpoint && $request->method() === 'GET';
        });
    }

    public function test_it_omits_min_id_on_first_poll(): void
    {
        $this->fakeTimeline([]);

        $this->fetch();

        Http::assertSent(fn ($request) => ! str_contains($request->url(), 'min_id'));
    }

    public function test_it_passes_min_id_on_subsequent_polls(): void
    {
        $this->fakeTimeline([]);

        $this->fetch('109876543210');

        Http::assertSent(fn ($request) => str_contains($request->url(), 'min_id=109876543210'));
    }

    public function test_it_returns_an_empty_collection_when_the_api_returns_no_posts(): void
    {
        $this->fakeTimeline([]);

        $posts = $this->fetch();

        self::assertInstanceOf(Collection::class, $posts);
        self::assertTrue($posts->isEmpty());
    }

    public function test_it_maps_each_status_to_a_fediverse_post(): void
    {
        $this->fakeTimeline([
            $this->mastodonStatus(id: '109876543210', url: 'https://mastodon.social/@alice/109876543210', content: '<p>Hello</p>'),
            $this->mastodonStatus(id: '109876543211', url: 'https://mastodon.social/@bob/109876543211', content: '<p>World</p>'),
        ]);

        $posts = $this->fetch();
        $head = $posts->first();

        self::assertCount(2, $posts);
        self::assertInstanceOf(FediversePost::class, $head);
        self::assertSame('109876543210', $head->cursorId);
        self::assertSame('https://mastodon.social/@alice/109876543210', $head->selfUrl);
        self::assertSame('<p>Hello</p>', $head->body);
    }

    public function test_it_maps_published_at_from_created_at(): void
    {
        $this->fakeTimeline([
            $this->mastodonStatus(id: '109876543210', url: 'https://mastodon.social/@alice/109876543210'),
        ]);

        self::assertSame('2026-04-25T10:00:00Z', $this->fetch()->first()->publishedAt);
    }

    public function test_it_sets_title_to_null_for_mastodon_statuses(): void
    {
        $this->fakeTimeline([
            $this->mastodonStatus(id: '109876543210', url: 'https://mastodon.social/@alice/109876543210'),
        ]);

        self::assertNull($this->fetch()->first()->title);
    }

    public function test_it_falls_back_to_uri_when_url_is_null(): void
    {
        $this->fakeTimeline([
            $this->mastodonStatus(
                id: '109876543210',
                url: null,
                uri: 'https://hachyderm.io/users/bob/statuses/5678',
                content: '<p>federated post</p>'
            ),
        ]);

        self::assertSame('https://hachyderm.io/users/bob/statuses/5678', $this->fetch()->first()->selfUrl);
    }

    public function test_it_preserves_newest_first_ordering_from_the_api(): void
    {
        $this->fakeTimeline([
            $this->mastodonStatus(id: '300', url: 'https://mastodon.social/@a/300'),
            $this->mastodonStatus(id: '200', url: 'https://mastodon.social/@b/200'),
            $this->mastodonStatus(id: '100', url: 'https://mastodon.social/@c/100'),
        ]);

        $cursorIds = $this->fetch()->pluck('cursorId')->all();

        self::assertSame(['300', '200', '100'], $cursorIds);
    }

    public function test_it_returns_an_empty_collection_on_a_non_2xx_response(): void
    {
        Http::fake(['*' => Http::response('Too many requests', 429)]);

        $posts = $this->fetch();

        self::assertInstanceOf(Collection::class, $posts);
        self::assertTrue($posts->isEmpty());
    }

    public function test_it_returns_an_empty_collection_when_the_response_body_is_not_json(): void
    {
        Http::fake(['*' => Http::response('<html>error</html>', 200)]);

        $posts = $this->fetch();

        self::assertInstanceOf(Collection::class, $posts);
        self::assertTrue($posts->isEmpty());
    }

    public function test_it_sends_the_configured_user_agent(): void
    {
        $this->fakeTimeline([]);

        $this->fetch();

        $expected = config('fedi-discover.http.user_agent');
        Http::assertSent(fn ($request) => $request->header('User-Agent')[0] === $expected);
    }

    /**
     * Fake every outgoing request with a 200 response carrying the given statuses.
     *
     * @param array<int, array<string, mixed>> $statuses
     */
    private function fakeTimeline(array $statuses): void
    {
        Http::fake(['*' => Http::response($statuses, 200)]);
    }

    /**
     * Poll the test instance with the given cursor (null means first poll).
     *
     * @return Collection<int, FediversePost>
     */
    private function fetch(?string $cursor = null): Collection
    {
        return (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), $cursor);
    }

    private function mastodonInstance(): Instance
    {
        return new Instance([
            'type' => InstanceType::Mastodon,
            'url' => 'https://mastodon.social',
        ]);
    }

    /**
     * Build a status payload in the shape the client reads; `uri` falls back to
     * a mastodon.social status URI derived from the id.
     *
     * @return array<string, mixed>
     */
    private function mastodonStatus(
        string $id,
        ?string $url = null,
        ?string $uri = null,
        string $content = '<p>example</p>',
    ): array {
        $status = [
            'id' => $id,
            'url' => $url,
            'uri' => $uri ?? "https://mastodon.social/users/x/statuses/{$id}",
            'content' => $content,
            'created_at' => '2026-04-25T10:00:00Z',
            'account' => ['acct' => 'alice@mastodon.social'],
        ];

        return $status;
    }
}

View file

@ -0,0 +1,268 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Carbon\CarbonImmutable;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Event;
use Illuminate\Support\Facades\Log;
use Lvl0\FediDiscover\Actions\PollFediverseAction;
use Lvl0\FediDiscover\Clients\FediverseClientFactory;
use Lvl0\FediDiscover\Clients\FediverseClientInterface;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Events\UrlDiscovered;
use Lvl0\FediDiscover\Models\Instance;
use Lvl0\FediDiscover\ValueObjects\FediversePost;
use Mockery;
use Tests\TestCase;
/**
 * Feature tests for PollFediverseAction.
 *
 * Each test feeds canned FediversePost objects through a mocked client and
 * checks the UrlDiscovered events that are dispatched and the bookkeeping
 * columns written back to the Instance row.
 */
class PollFediverseActionTest extends TestCase
{
    use RefreshDatabase;

    public function test_it_fires_one_event_per_extracted_url(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://example.com/one and https://other.example/two'),
        ]);

        Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/one');
        Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://other.example/two');
        Event::assertDispatchedTimes(UrlDiscovered::class, 2);
    }

    public function test_it_extracts_urls_from_html_anchor_tags(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', '<p>Check <a href="https://example.com/article">this</a>!</p>'),
        ]);

        Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/article');
        Event::assertDispatchedTimes(UrlDiscovered::class, 1);
    }

    public function test_it_extracts_urls_from_markdown_links(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll(
            posts: [new FediversePost('1', 'https://lemmy.world/post/42', 'A [great article](https://example.com/article) about trees.')],
            instanceUrl: 'https://lemmy.world',
        );

        Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/article');
        Event::assertDispatchedTimes(UrlDiscovered::class, 1);
    }

    public function test_it_strips_trailing_punctuation_from_urls(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'Check https://example.com/article, it is great. Also https://other.example/page.'),
        ]);

        Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/article');
        Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://other.example/page');
        // Pin the count so an extra event carrying the un-stripped variant
        // (e.g. "…/article,") cannot slip through alongside the clean one.
        Event::assertDispatchedTimes(UrlDiscovered::class, 2);
    }

    public function test_it_deduplicates_urls_within_a_single_post(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'Here is https://example.com/article and again https://example.com/article'),
        ]);

        Event::assertDispatchedTimes(UrlDiscovered::class, 1);
    }

    public function test_it_filters_urls_on_the_polling_instance_host(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://mastodon.social/@bob/42 and https://example.com/article'),
        ]);

        Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/article');
        Event::assertDispatchedTimes(UrlDiscovered::class, 1);
    }

    public function test_it_ignores_posts_with_a_null_body(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', null),
        ]);

        Event::assertNotDispatched(UrlDiscovered::class);
    }

    public function test_it_ignores_non_http_schemes(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'Email mailto:alice@example.com or try ftp://files.example.com/x'),
        ]);

        Event::assertNotDispatched(UrlDiscovered::class);
    }

    public function test_it_passes_post_self_url_and_body_through_to_the_event(): void
    {
        Event::fake([UrlDiscovered::class]);
        $instance = $this->makeInstance();
        $body = 'Here is https://example.com/article with surrounding context.';

        $this->pollInstance($instance, [
            new FediversePost('1', 'https://mastodon.social/@alice/1', $body),
        ]);

        Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->postUrl === 'https://mastodon.social/@alice/1'
            && $e->postBody === $body
            && $e->instanceId === $instance->id
            && $e->discoveredAt instanceof CarbonImmutable
        );
    }

    public function test_it_processes_multiple_posts(): void
    {
        Event::fake([UrlDiscovered::class]);

        $this->poll([
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://example.com/one'),
            new FediversePost('2', 'https://mastodon.social/@bob/2', 'Also https://example.com/two'),
        ]);

        Event::assertDispatchedTimes(UrlDiscovered::class, 2);
    }

    public function test_it_updates_last_seen_id_to_the_first_posts_cursor(): void
    {
        $instance = $this->makeInstance();

        // Clients return newest-first; the action treats posts[0]
        // as the new high-water mark without inspecting cursor values.
        $this->pollInstance($instance, [
            new FediversePost('newest-cursor', 'https://mastodon.social/@alice/3', 'x'),
            new FediversePost('middle-cursor', 'https://mastodon.social/@bob/2', 'y'),
            new FediversePost('oldest-cursor', 'https://mastodon.social/@carol/1', 'z'),
        ]);

        $this->assertSame('newest-cursor', $instance->fresh()->last_seen_id);
    }

    public function test_it_updates_last_polled_at(): void
    {
        $instance = $this->makeInstance();
        $this->assertNull($instance->last_polled_at);

        $this->pollInstance($instance, [
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'x'),
        ]);

        $this->assertNotNull($instance->fresh()->last_polled_at);
    }

    public function test_it_passes_the_existing_last_seen_id_to_the_client(): void
    {
        $instance = $this->makeInstance(['last_seen_id' => '999']);

        // The only assertion here is the mock expectation itself: the client
        // must be called exactly once with the stored cursor.
        $client = Mockery::mock(FediverseClientInterface::class);
        $client->shouldReceive('fetchPostsSince')
            ->once()
            ->with($instance, $instance->last_seen_id)
            ->andReturn(collect());

        $factory = Mockery::mock(FediverseClientFactory::class);
        $factory->shouldReceive('for')->with($instance)->andReturn($client);

        (new PollFediverseAction($factory))->execute($instance);
    }

    public function test_it_leaves_last_seen_id_unchanged_when_no_posts_are_returned(): void
    {
        $instance = $this->makeInstance(['last_seen_id' => '500']);

        $this->pollInstance($instance, []);

        $this->assertSame('500', $instance->fresh()->last_seen_id);
    }

    public function test_consecutive_poll_failures_reset_to_zero_after_successful_poll(): void
    {
        $instance = $this->makeInstance(['consecutive_poll_failures' => 5]);

        $this->pollInstance($instance, []);

        $this->assertSame(0, $instance->fresh()->consecutive_poll_failures);
    }

    public function test_poll_logs_a_structured_success_entry_with_url_count_and_duration(): void
    {
        Log::spy();
        Event::fake([UrlDiscovered::class]);
        $instance = $this->makeInstance();

        // Three distinct URLs across two posts.
        $this->pollInstance($instance, [
            new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://example.com/one and https://other.example/two'),
            new FediversePost('2', 'https://mastodon.social/@bob/2', 'Also https://example.com/three'),
        ]);

        Log::shouldHaveReceived('info')
            ->once()
            ->withArgs(function (string $message, array $context) use ($instance): bool {
                return $message === 'fedi-discover:poll succeeded'
                    && $context['instance_id'] === $instance->id
                    && $context['url_count'] === 3
                    && isset($context['duration_ms'])
                    && $context['duration_ms'] >= 0;
            });
    }

    /**
     * Poll a freshly created instance at $instanceUrl with the given posts.
     *
     * @param  array<FediversePost>  $posts
     */
    private function poll(array $posts, string $instanceUrl = 'https://mastodon.social'): void
    {
        $this->pollInstance($this->makeInstance(['url' => $instanceUrl]), $posts);
    }

    /**
     * Run the action against $instance with a mocked client that returns $posts.
     *
     * @param  array<FediversePost>  $posts
     */
    private function pollInstance(Instance $instance, array $posts): void
    {
        $client = Mockery::mock(FediverseClientInterface::class);
        $client->shouldReceive('fetchPostsSince')->andReturn(collect($posts));

        $factory = Mockery::mock(FediverseClientFactory::class);
        $factory->shouldReceive('for')->andReturn($client);

        (new PollFediverseAction($factory))->execute($instance);
    }

    /**
     * Persist an enabled Mastodon instance, with optional attribute overrides.
     *
     * @param  array<string, mixed>  $overrides
     */
    private function makeInstance(array $overrides = []): Instance
    {
        return Instance::create(array_merge([
            'type' => InstanceType::Mastodon,
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => 600,
        ], $overrides));
    }
}

View file

@ -0,0 +1,202 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Event;
use Lvl0\FediDiscover\Actions\PollFediverseAction;
use Lvl0\FediDiscover\Clients\FediverseClientFactory;
use Lvl0\FediDiscover\Clients\FediverseClientInterface;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Events\PollFailed;
use Lvl0\FediDiscover\Models\Instance;
use Mockery;
use RuntimeException;
use Tests\TestCase;
/**
 * Feature tests for the `fedi-discover:poll` artisan command: which instances
 * it polls, how it reacts to a failing instance, and the exit code it returns.
 */
class PollInstancesCommandTest extends TestCase
{
    use RefreshDatabase;

    protected function setUp(): void
    {
        parent::setUp();

        // Bind a no-op factory stub so the command can resolve PollFediverseAction
        // from the container without making real HTTP calls.
        $clientStub = Mockery::mock(FediverseClientInterface::class);
        $clientStub->shouldReceive('fetchPostsSince')->andReturn(collect());

        $factoryStub = Mockery::mock(FediverseClientFactory::class);
        $factoryStub->shouldReceive('for')->andReturn($clientStub);

        $this->app->instance(FediverseClientFactory::class, $factoryStub);
    }

    public function test_it_exits_zero_when_there_are_no_enabled_instances(): void
    {
        $this->artisan('fedi-discover:poll')
            ->assertExitCode(0);
    }

    public function test_it_calls_the_action_for_each_enabled_instance_and_skips_disabled(): void
    {
        $enabled1 = $this->makeInstance('https://mastodon.social');
        $enabled2 = $this->makeInstance('https://fosstodon.org');
        $this->makeInstance('https://disabled.example', enabled: false);

        $calledWith = [];
        $action = Mockery::mock(PollFediverseAction::class);
        $action->shouldReceive('execute')
            ->twice()
            ->withArgs(function (Instance $instance) use (&$calledWith): bool {
                $calledWith[] = $instance->url;

                return true;
            });
        $this->app->instance(PollFediverseAction::class, $action);

        $this->artisan('fedi-discover:poll')->assertExitCode(0);

        $this->assertEqualsCanonicalizing(
            [$enabled1->url, $enabled2->url],
            $calledWith,
        );
    }

    public function test_one_instance_throwing_does_not_stop_remaining_instances_from_being_polled(): void
    {
        $this->makeInstance('https://failing.example');
        $healthy = $this->makeInstance('https://healthy.example');

        $calledWith = [];
        $action = Mockery::mock(PollFediverseAction::class);
        $action->shouldReceive('execute')
            ->twice()
            ->andReturnUsing(function (Instance $instance) use (&$calledWith): void {
                $calledWith[] = $instance->url;
                if ($instance->url === 'https://failing.example') {
                    throw new RuntimeException('Connection refused');
                }
            });
        $this->app->instance(PollFediverseAction::class, $action);

        $this->artisan('fedi-discover:poll')->assertExitCode(1);

        $this->assertEqualsCanonicalizing(
            ['https://failing.example', $healthy->url],
            $calledWith,
        );
    }

    public function test_poll_failed_event_is_dispatched_when_action_throws(): void
    {
        Event::fake([PollFailed::class]);
        $instance = $this->makeInstance('https://failing.example');

        $action = Mockery::mock(PollFediverseAction::class);
        $action->shouldReceive('execute')
            ->once()
            ->andReturnUsing(function (): void {
                throw new RuntimeException('Connection refused');
            });
        $this->app->instance(PollFediverseAction::class, $action);

        // A failing instance must also surface through the exit code.
        $this->artisan('fedi-discover:poll')->assertExitCode(1);

        Event::assertDispatched(PollFailed::class, function (PollFailed $event) use ($instance): bool {
            return $event->instance->id === $instance->id
                && $event->message === 'Connection refused';
        });
    }

    public function test_poll_failed_event_is_not_dispatched_on_a_successful_poll(): void
    {
        Event::fake([PollFailed::class]);
        $this->makeInstance('https://healthy.example');

        // setUp() already binds a client factory stub that returns no posts,
        // so the real action runs without HTTP; no override needed.
        $this->artisan('fedi-discover:poll')->assertExitCode(0);

        Event::assertNotDispatched(PollFailed::class);
    }

    public function test_it_exits_one_when_at_least_one_instance_fails(): void
    {
        $this->makeInstance('https://failing.example');
        $this->makeInstance('https://healthy.example');

        $action = Mockery::mock(PollFediverseAction::class);
        $action->shouldReceive('execute')
            ->twice()
            ->andReturnUsing(function (Instance $instance): void {
                if ($instance->url === 'https://failing.example') {
                    throw new RuntimeException('Connection refused');
                }
            });
        $this->app->instance(PollFediverseAction::class, $action);

        $this->artisan('fedi-discover:poll')->assertExitCode(1);
    }

    /**
     * Persist a Mastodon instance with a 600-second interval.
     */
    private function makeInstance(string $url, bool $enabled = true): Instance
    {
        return Instance::create([
            'type' => InstanceType::Mastodon,
            'url' => $url,
            'enabled' => $enabled,
            'interval_seconds' => 600,
        ]);
    }
}

View file

@ -0,0 +1,221 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Feature;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Feature tests for the `fedi-discover:validate` artisan command: exit codes,
 * per-row error reporting, the summary line, and the --enabled-only flag.
 */
class ValidateInstancesCommandTest extends TestCase
{
    use RefreshDatabase;

    public function test_it_exits_zero_when_the_database_is_empty(): void
    {
        $this->artisan('fedi-discover:validate')
            ->assertExitCode(0);
    }

    public function test_it_exits_zero_when_all_instances_are_valid(): void
    {
        $this->makeInstance();

        $this->artisan('fedi-discover:validate')
            ->assertExitCode(0);
    }

    public function test_it_exits_nonzero_when_a_row_has_an_invalid_url(): void
    {
        $this->makeInstance(['url' => 'not-a-url']);

        $this->artisan('fedi-discover:validate')
            ->assertExitCode(1);
    }

    public function test_it_exits_nonzero_when_a_row_has_a_zero_interval(): void
    {
        $this->makeInstance(['interval_seconds' => 0]);

        $this->artisan('fedi-discover:validate')
            ->assertExitCode(1);
    }

    public function test_it_reports_summary_of_valid_and_invalid_counts(): void
    {
        $this->makeInstance(['url' => 'https://mastodon.social']);
        $this->makeInstance(['url' => 'https://hachyderm.io']);
        $this->makeInstance(['url' => 'bogus']);

        $this->artisan('fedi-discover:validate')
            ->expectsOutputToContain('3')
            ->expectsOutputToContain('2 valid')
            ->expectsOutputToContain('1 invalid')
            ->assertExitCode(1);
    }

    public function test_it_does_not_fail_fast_and_reports_every_invalid_row(): void
    {
        $this->makeInstance(['url' => 'bogus-one']);
        $second = $this->makeInstance(['interval_seconds' => 0]);

        $this->artisan('fedi-discover:validate')
            ->expectsOutputToContain('bogus-one')
            ->expectsOutputToContain((string) $second->id)
            ->assertExitCode(1);
    }

    public function test_it_includes_the_validation_error_message_for_each_invalid_row(): void
    {
        $this->makeInstance(['url' => 'not-a-url']);

        $this->artisan('fedi-discover:validate')
            ->expectsOutputToContain('Invalid URL: not-a-url')
            ->assertExitCode(1);
    }

    public function test_summary_counts_are_accurate_when_mixed(): void
    {
        // 2 valid
        $this->makeInstance(['url' => 'https://mastodon.social']);
        $this->makeInstance(['url' => 'https://hachyderm.io']);

        // 3 invalid (different defects)
        $this->makeInstance(['url' => 'bogus-one']);
        $this->makeInstance(['url' => 'https://fosstodon.org', 'interval_seconds' => 0]);
        $this->makeInstance(['url' => 'also-bad', 'interval_seconds' => -5]);

        $this->artisan('fedi-discover:validate')
            ->expectsOutputToContain('5')
            ->expectsOutputToContain('2 valid')
            ->expectsOutputToContain('3 invalid')
            ->assertExitCode(1);
    }

    public function test_it_exits_zero_with_enabled_only_when_no_enabled_instances_exist(): void
    {
        $this->makeInstance(['enabled' => false]);

        $this->artisan('fedi-discover:validate', ['--enabled-only' => true])
            ->assertExitCode(0);
    }

    public function test_it_exits_zero_with_an_enabled_only_flag_when_disabled_rows_are_invalid(): void
    {
        // A disabled row that would fail InstanceConfig validation
        $this->makeInstance([
            'url' => 'broken-and-disabled',
            'enabled' => false,
            'interval_seconds' => 0,
        ]);

        // A valid enabled row
        $this->makeInstance();

        $this->artisan('fedi-discover:validate', ['--enabled-only' => true])
            ->assertExitCode(0);
    }

    /**
     * Persist an instance row. The defaults describe an enabled Mastodon
     * instance at https://mastodon.social with a 600-second interval and no
     * extras; pass overrides to introduce the defect under test.
     *
     * @param  array<string, mixed>  $overrides
     */
    private function makeInstance(array $overrides = []): Instance
    {
        return Instance::create(array_merge([
            'type' => InstanceType::Mastodon,
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => 600,
            'extras' => [],
        ], $overrides));
    }
}

View file

@ -0,0 +1,121 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Unit;
use Lvl0\FediDiscover\Config\InstanceConfig;
use Lvl0\FediDiscover\Config\InstanceType;
use PHPUnit\Framework\Attributes\DataProvider;
use PHPUnit\Framework\TestCase;
/**
 * Unit tests for InstanceConfig::fromArray() / toArray(): field mapping,
 * defaults, and rejection of invalid input.
 */
class InstanceConfigTest extends TestCase
{
    public function test_from_array_returns_instance_config_with_correct_field_values(): void
    {
        $config = InstanceConfig::fromArray([
            'type' => 'mastodon',
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => 600,
            'extras' => ['token' => 'abc123'],
        ]);

        $this->assertSame(InstanceType::Mastodon, $config->type);
        $this->assertSame('https://mastodon.social', $config->url);
        $this->assertTrue($config->enabled);
        $this->assertSame(600, $config->intervalSeconds);
        $this->assertSame(['token' => 'abc123'], $config->extras);
    }

    /**
     * "Non-positive" covers negatives as well as zero, so both are exercised.
     */
    #[DataProvider('nonPositiveIntervalProvider')]
    public function test_from_array_rejects_non_positive_interval_seconds(int $intervalSeconds = 0): void
    {
        $this->expectException(\InvalidArgumentException::class);

        InstanceConfig::fromArray([
            'type' => 'mastodon',
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => $intervalSeconds,
            'extras' => [],
        ]);
    }

    /**
     * @return array<string, array{int}>
     */
    public static function nonPositiveIntervalProvider(): array
    {
        return [
            'zero' => [0],
            'minus one' => [-1],
            'large negative' => [-600],
        ];
    }

    public function test_extras_defaults_to_empty_array_when_omitted(): void
    {
        $config = InstanceConfig::fromArray([
            'type' => 'mastodon',
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => 600,
        ]);

        $this->assertSame([], $config->extras);
    }

    #[DataProvider('requiredKeyProvider')]
    public function test_from_array_throws_when_required_key_is_missing(string $missingKey): void
    {
        $input = [
            'type' => 'mastodon',
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => 600,
        ];
        unset($input[$missingKey]);

        // The exception message is expected to name the missing key.
        $this->expectException(\InvalidArgumentException::class);
        $this->expectExceptionMessageMatches('/' . preg_quote($missingKey, '/') . '/');

        InstanceConfig::fromArray($input);
    }

    /**
     * @return array<string, array{string}>
     */
    public static function requiredKeyProvider(): array
    {
        return [
            'type missing' => ['type'],
            'url missing' => ['url'],
            'enabled missing' => ['enabled'],
            'interval_seconds missing' => ['interval_seconds'],
        ];
    }

    public function test_from_array_throws_invalid_argument_exception_for_unknown_type_string(): void
    {
        $this->expectException(\InvalidArgumentException::class);
        $this->expectExceptionMessageMatches('/pleroma/');

        InstanceConfig::fromArray([
            'type' => 'pleroma',
            'url' => 'https://pleroma.example.com',
            'enabled' => true,
            'interval_seconds' => 600,
        ]);
    }

    public function test_from_array_rejects_malformed_url(): void
    {
        $this->expectException(\InvalidArgumentException::class);

        InstanceConfig::fromArray([
            'type' => 'mastodon',
            'url' => 'not a url',
            'enabled' => true,
            'interval_seconds' => 600,
        ]);
    }

    public function test_to_array_produces_array_that_round_trips_through_from_array(): void
    {
        $original = [
            'type' => 'mastodon',
            'url' => 'https://mastodon.social',
            'enabled' => true,
            'interval_seconds' => 600,
            'extras' => ['token' => 'abc123'],
        ];

        $this->assertSame($original, InstanceConfig::fromArray($original)->toArray());
    }
}

View file

@ -0,0 +1,31 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Unit;
use Carbon\CarbonImmutable;
use Lvl0\FediDiscover\Events\PollFailed;
use Lvl0\FediDiscover\Models\Instance;
use PHPUnit\Framework\TestCase;
/**
 * Checks that PollFailed hands back exactly what it was constructed with.
 */
class PollFailedTest extends TestCase
{
    public function test_it_exposes_all_payload_fields(): void
    {
        $when = CarbonImmutable::parse('2026-04-28T09:00:00');
        $reason = 'Connection timed out';

        $source = new Instance;
        $source->id = 7;

        $event = new PollFailed(instance: $source, message: $reason, failedAt: $when);

        $this->assertSame($source, $event->instance);
        $this->assertSame('Connection timed out', $event->message);
        $this->assertTrue($when->eq($event->failedAt));
    }
}

View file

@ -0,0 +1,44 @@
<?php
declare(strict_types=1);
namespace Lvl0\FediDiscover\Tests\Unit;
use Carbon\CarbonImmutable;
use Lvl0\FediDiscover\Events\UrlDiscovered;
use PHPUnit\Framework\TestCase;
/**
 * Checks that UrlDiscovered exposes its constructor arguments unchanged and
 * accepts a null post body.
 */
class UrlDiscoveredTest extends TestCase
{
    public function test_it_exposes_all_payload_fields(): void
    {
        $when = CarbonImmutable::parse('2026-04-26T12:00:00');
        $link = 'https://example.com/article';
        $permalink = 'https://mastodon.social/@alice/109876543210';
        $text = 'Check out this article: https://example.com/article';

        $event = new UrlDiscovered(
            url: $link,
            instanceId: 42,
            discoveredAt: $when,
            postUrl: $permalink,
            postBody: $text
        );

        $this->assertSame('https://example.com/article', $event->url);
        $this->assertSame(42, $event->instanceId);
        $this->assertTrue($when->eq($event->discoveredAt));
        $this->assertSame('https://mastodon.social/@alice/109876543210', $event->postUrl);
        $this->assertSame('Check out this article: https://example.com/article', $event->postBody);
    }

    public function test_post_body_is_nullable(): void
    {
        $when = CarbonImmutable::parse('2026-04-26T12:00:00');

        $event = new UrlDiscovered(
            url: 'https://example.com/article',
            instanceId: 1,
            discoveredAt: $when,
            postUrl: 'https://mastodon.social/@alice/109876543210',
            postBody: null
        );

        $this->assertNull($event->postBody);
    }
}

View file

@ -3,6 +3,11 @@
xsi:noNamespaceSchemaLocation="vendor/phpunit/phpunit/phpunit.xsd"
bootstrap="vendor/autoload.php"
colors="true"
processIsolation="false"
displayDetailsOnPhpunitDeprecations="true"
displayDetailsOnTestsThatTriggerErrors="true"
displayDetailsOnTestsThatTriggerWarnings="true"
displayDetailsOnTestsThatTriggerNotices="true"
>
<testsuites>
<testsuite name="Unit">
@ -22,19 +27,21 @@
</include>
</source>
<php>
<env name="APP_ENV" value="testing"/>
<env name="APP_MAINTENANCE_DRIVER" value="file"/>
<env name="BCRYPT_ROUNDS" value="4"/>
<env name="BROADCAST_CONNECTION" value="null"/>
<env name="CACHE_STORE" value="array"/>
<env name="DB_CONNECTION" value="sqlite"/>
<env name="DB_DATABASE" value=":memory:"/>
<env name="DB_URL" value=""/>
<env name="MAIL_MAILER" value="array"/>
<env name="QUEUE_CONNECTION" value="sync"/>
<env name="SESSION_DRIVER" value="array"/>
<env name="PULSE_ENABLED" value="false"/>
<env name="TELESCOPE_ENABLED" value="false"/>
<env name="NIGHTWATCH_ENABLED" value="false"/>
<server name="APP_ENV" value="testing"/>
<server name="APP_MAINTENANCE_DRIVER" value="file"/>
<server name="BCRYPT_ROUNDS" value="4"/>
<server name="BROADCAST_CONNECTION" value="null"/>
<server name="CACHE_STORE" value="array"/>
<server name="DB_CONNECTION" value="sqlite"/>
<server name="DB_DATABASE" value=":memory:"/>
<server name="DB_URL" value=""/>
<server name="MAIL_MAILER" value="array"/>
<server name="QUEUE_CONNECTION" value="sync"/>
<server name="SESSION_DRIVER" value="array"/>
<server name="PULSE_ENABLED" value="false"/>
<server name="TELESCOPE_ENABLED" value="false"/>
<server name="NIGHTWATCH_ENABLED" value="false"/>
<ini name="display_errors" value="On"/>
<ini name="error_reporting" value="-1"/>
</php>
</phpunit>

8
pint.json Normal file
View file

@ -0,0 +1,8 @@
{
"preset": "laravel",
"rules": {
"concat_space": {
"spacing": "one"
}
}
}

View file

@ -6,15 +6,15 @@
define('LARAVEL_START', microtime(true));
// Determine if the application is in maintenance mode...
if (file_exists($maintenance = __DIR__.'/../storage/framework/maintenance.php')) {
if (file_exists($maintenance = __DIR__ . '/../storage/framework/maintenance.php')) {
require $maintenance;
}
// Register the Composer autoloader...
require __DIR__.'/../vendor/autoload.php';
require __DIR__ . '/../vendor/autoload.php';
// Bootstrap Laravel and handle the request...
/** @var Application $app */
$app = require_once __DIR__.'/../bootstrap/app.php';
$app = require_once __DIR__ . '/../bootstrap/app.php';
$app->handleRequest(Request::capture());

View file

@ -0,0 +1,29 @@
@extends('layouts.app')

@section('content')
    {{-- Overview table with one row per entry in $instances. --}}
    <div>
        <h1>Instances</h1>
        <table>
            <thead>
                <tr>
                    <th>Instance</th>
                    <th>Last polled at</th>
                    <th>URLs</th>
                    <th>Errors</th>
                </tr>
            </thead>
            <tbody>
                @forelse($instances as $instance)
                    <tr>
                        <td>{{ $instance->url }}</td>
                        {{-- last_polled_at stays null until an instance has been polled once. --}}
                        <td>{{ $instance->last_polled_at ?? 'Never' }}</td>
                        <td>{{ $instance->pages_count }} URLs</td>
                        <td>{{ $instance->failed_pages_count }} errors</td>
                    </tr>
                @empty
                    {{-- Show an explicit message instead of a header-only table. --}}
                    <tr>
                        <td colspan="4">No instances configured.</td>
                    </tr>
                @endforelse
            </tbody>
        </table>
    </div>
@endsection

View file

@ -0,0 +1,63 @@
@extends('layouts.app')
@section('content')
<main>
<h1>About TroveBot</h1>
<p>
<strong>Trove</strong> is a federated search engine for the small web,
seeded by fediverse attention and ranked by domain coherence rather than
commercial authority. <strong>TroveBot</strong> is its crawler — it
discovers and indexes URLs shared by people on the fediverse, then
follows the citations they make to find more of the small web.
</p>
<h2>Identity</h2>
<p>TroveBot identifies itself with the following User-Agent string:</p>
<pre><code>TroveBot/0.1 (+https://trove.lvl0.xyz/bot)</code></pre>
<h2>Crawling behavior</h2>
<ul>
<li>Respects <code>robots.txt</code> rules under <code>User-agent: TroveBot</code> (and the wildcard <code>User-agent: *</code> as a fallback).</li>
<li>Polite per-domain rate limit — at most a few requests per minute per host.</li>
<li>Follows up to 5 redirects per URL.</li>
<li>Fetches HTML only. PDFs, images, and other binary content are recorded as discovered but never re-fetched.</li>
<li>Does not execute JavaScript, does not crawl behind authentication, does not crawl URLs containing user credentials.</li>
</ul>
<h2>Opt out</h2>
<p>
Block TroveBot entirely by adding the following to your site's
<code>robots.txt</code>:
</p>
<pre><code>User-agent: TroveBot
Disallow: /</code></pre>
<p>
Or block specific paths:
</p>
<pre><code>User-agent: TroveBot
Disallow: /private/
Disallow: /admin/</code></pre>
<h2>Contact &amp; source</h2>
<ul>
<li>
Issues, questions, abuse reports:
<a href="https://forge.lvl0.xyz/lvl0/trove/issues">forge.lvl0.xyz/lvl0/trove/issues</a>
</li>
<li>
Source code:
<a href="https://forge.lvl0.xyz/lvl0/trove">forge.lvl0.xyz/lvl0/trove</a>
</li>
</ul>
</main>
@endsection

View file

@ -0,0 +1,18 @@
<!DOCTYPE html>
<html lang="{{ str_replace('_', '-', app()->getLocale()) }}">
<head>
{{-- Base layout: child views fill the 'content' section and may set a 'title' section. --}}
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
{{-- Falls back to config('app.name') when the child view defines no 'title' section. --}}
<title>Trove @yield('title', config('app.name'))</title>
@vite(['resources/css/app.css', 'resources/js/app.js'])
@livewireStyles
</head>
<body>
@yield('content')
@livewireScripts
</body>
</html>

View file

@ -0,0 +1,14 @@
<div>
{{-- URL submission form: shows a confirmation once $confirmedUrl is set, otherwise the form. --}}
{{-- Errors stored under the 'rate_limit' key are shown above either state. --}}
@error('rate_limit') <p>{{ $message }}</p> @enderror
@if ($confirmedUrl !== null)
<p>Thanks, we've received <strong>{{ $confirmedUrl }}</strong></p>
@else
{{-- Submitting calls the component's submit action; the input is bound to its url property. --}}
<form wire:submit="submit">
<label for="url">URL</label>
<input id="url" type="url" wire:model="url" required>
@error('url') <p>{{ $message }}</p> @enderror
<button type="submit">Submit</button>
</form>
@endif
</div>

View file

@ -0,0 +1,7 @@
@extends('layouts.app')
@section('content')
{{-- Page shell for URL submission; all behaviour lives in the url-submission-form Livewire component. --}}
<livewire:url-submission-form />
@endsection

File diff suppressed because one or more lines are too long

View file

@ -1,8 +1,8 @@
<?php
use Illuminate\Foundation\Inspiring;
use Illuminate\Support\Facades\Artisan;
use Illuminate\Support\Facades\Schedule;
Artisan::command('inspire', function () {
$this->comment(Inspiring::quote());
})->purpose('Display an inspiring quote');
Schedule::command('fedi-discover:poll')
->everyMinute()
->withoutOverlapping(5)
->runInBackground();

View file

@ -1,7 +1,16 @@
<?php
declare(strict_types=1);
use App\Http\Controllers\Admin\InstancesController;
use Illuminate\Support\Facades\Route;
// Landing page.
Route::get('/', function () {
return view('welcome');
});
// Static pages rendered straight from Blade views, without a controller.
Route::view('/submit', 'urls.submit');
// Crawler information page.
Route::view('/bot', 'bot');
// NOTE(review): no auth middleware is visible on this admin route — confirm it
// is protected elsewhere (route group, reverse proxy) before exposing it publicly.
Route::get('/admin/instances', [InstancesController::class, 'index'])->name('admin.instances');

View file

@ -92,6 +92,10 @@ pkgs.mkShell {
podman-compose -f $COMPOSE_FILE exec app php artisan "$@"
}
# Run a composer command inside the app container, forwarding all arguments.
dev-composer() {
podman-compose -f $COMPOSE_FILE exec app composer "$@"
}
# ===================
# BUILD COMMANDS
# ===================
@ -141,6 +145,7 @@ pkgs.mkShell {
echo " dev-logs-redis Tail Redis logs"
echo " dev-shell Shell into app container"
echo " dev-artisan <cmd> Run artisan command"
echo " dev-composer <cmd> Run composer command"
echo " base-build Build and push image"
echo ""
echo "Services:"

View file

@ -0,0 +1,511 @@
<?php
declare(strict_types=1);
namespace Tests\Feature\Actions;
use App\Actions\FetchPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Services\LanguageDetectionService;
use App\ValueObjects\FetchResult;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Psr7\Request;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Http;
use Tests\TestCase;
class FetchPageActionTest extends TestCase
{
/**
 * A 200 text/html response yields a Success result carrying the status
 * code and a final URL.
 */
public function test_successful_html_fetch_returns_success_outcome(): void
{
    $page = Http::response(
        '<html><body>Hello</body></html>',
        200,
        ['Content-Type' => 'text/html'],
    );
    Http::fake(['example.com/*' => $page]);

    $fetch = $this->makeAction();
    $fetched = $fetch('https://example.com/page');

    $this->assertInstanceOf(FetchResult::class, $fetched);
    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame(200, $fetched->statusCode);
    $this->assertNotNull($fetched->finalUrl);
}
/**
 * A 404 is reported as Blocked4xx, with the status code echoed in the
 * error message.
 */
public function test_4xx_response_returns_blocked_4xx(): void
{
    $notFound = Http::response('Not Found', 404);
    Http::fake(['example.com/*' => $notFound]);

    $fetch = $this->makeAction();
    $fetched = $fetch('https://example.com/missing');

    $this->assertInstanceOf(FetchResult::class, $fetched);
    $this->assertSame(CrawlOutcomeEnum::Blocked4xx, $fetched->outcome);
    $this->assertSame(404, $fetched->statusCode);
    $this->assertIsString($fetched->errorMessage);
    $this->assertStringContainsString('404', $fetched->errorMessage);
    $this->assertNotNull($fetched->finalUrl);
}
/**
 * A 503 is reported as Blocked5xx, with the status code echoed in the
 * error message.
 */
public function test_5xx_response_returns_blocked_5xx(): void
{
    $unavailable = Http::response('Service Unavailable', 503);
    Http::fake(['example.com/*' => $unavailable]);

    $fetch = $this->makeAction();
    $fetched = $fetch('https://example.com/page');

    $this->assertInstanceOf(FetchResult::class, $fetched);
    $this->assertSame(CrawlOutcomeEnum::Blocked5xx, $fetched->outcome);
    $this->assertSame(503, $fetched->statusCode);
    $this->assertIsString($fetched->errorMessage);
    $this->assertStringContainsString('503', $fetched->errorMessage);
    $this->assertNotNull($fetched->finalUrl);
}
/**
 * A 200 response with a non-HTML content type is Rejected: the content
 * type appears in the error message and no extraction fields are filled.
 */
public function test_non_html_content_type_returns_rejected(): void
{
    $pdf = Http::response(
        'PDF binary stuff',
        200,
        ['Content-Type' => 'application/pdf'],
    );
    Http::fake(['example.com/*' => $pdf]);

    $fetch = $this->makeAction();
    $fetched = $fetch('https://example.com/document.pdf');

    // Classification and transport details.
    $this->assertInstanceOf(FetchResult::class, $fetched);
    $this->assertSame(CrawlOutcomeEnum::Rejected, $fetched->outcome);
    $this->assertSame(200, $fetched->statusCode);
    $this->assertIsString($fetched->errorMessage);
    $this->assertStringContainsString('application/pdf', $fetched->errorMessage);
    $this->assertNotNull($fetched->finalUrl);

    // Nothing was extracted from the body.
    $this->assertNull($fetched->title);
    $this->assertNull($fetched->extractedText);
    $this->assertEmpty($fetched->outboundLinks);
    $this->assertNull($fetched->wordCount);
}
/**
 * A charset parameter on the text/html content type does not prevent a
 * Success outcome.
 */
public function test_text_html_with_charset_is_accepted(): void
{
    $page = Http::response(
        '<html><body>Hello charset world</body></html>',
        200,
        ['Content-Type' => 'text/html; charset=utf-8'],
    );
    Http::fake(['example.com/*' => $page]);

    $fetch = $this->makeAction();
    $fetched = $fetch('https://example.com/page');

    $this->assertInstanceOf(FetchResult::class, $fetched);
    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame(200, $fetched->statusCode);
}
/**
 * A ConnectException with errno 6 yields a Failed result with no status
 * code, no final URL, and no extracted fields.
 */
public function test_connection_failure_returns_failed(): void
{
    Http::fake(fn () => throw new ConnectException(
        'Could not resolve host',
        new Request('GET', 'https://example.com/page'),
        null,
        ['errno' => 6],
    ));

    $fetch = $this->makeAction();
    $fetched = $fetch('https://example.com/page');

    $this->assertInstanceOf(FetchResult::class, $fetched);
    $this->assertSame(CrawlOutcomeEnum::Failed, $fetched->outcome);
    $this->assertNull($fetched->statusCode);
    $this->assertNull($fetched->finalUrl);
    $this->assertIsString($fetched->errorMessage);

    // Nothing was extracted because no response arrived.
    $this->assertNull($fetched->title);
    $this->assertNull($fetched->extractedText);
    $this->assertEmpty($fetched->outboundLinks);
    $this->assertNull($fetched->wordCount);
}
/**
 * A ConnectException with handler-context errno 28 must be reported with the
 * dedicated Timeout outcome rather than the generic Failed one.
 */
public function test_timeout_returns_timeout(): void
{
    $timedOut = new ConnectException(
        'cURL error 28: Operation timed out',
        new Request('GET', 'https://example.com/page'),
        null,
        ['errno' => 28],
    );
    // Every faked request throws instead of returning a response.
    Http::fake(fn () => throw $timedOut);

    $fetched = $this->makeAction()('https://example.com/page');

    $this->assertInstanceOf(FetchResult::class, $fetched);
    $this->assertSame(CrawlOutcomeEnum::Timeout, $fetched->outcome);
    $this->assertNull($fetched->statusCode);
    $this->assertNull($fetched->finalUrl);
    $this->assertIsString($fetched->errorMessage);
}
/**
 * On a successful HTML fetch the result's title must equal the document's
 * <title> text.
 */
public function test_success_extracts_title_from_html(): void
{
    $markup = '<html><head><title>My Page Title</title></head><body><p>Some content.</p></body></html>';

    Http::fake([
        'example.com/*' => Http::response($markup, 200, ['Content-Type' => 'text/html']),
    ]);

    $fetched = $this->makeAction()('https://example.com/page');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame('My Page Title', $fetched->title);
}
/**
 * On a successful fetch the extracted text must contain the article body.
 * The fixture surrounds the <article> with nav and footer noise.
 */
public function test_success_extracts_main_text(): void
{
    $document = <<<'HTML'
    <!DOCTYPE html>
    <html>
    <head><title>Article Title</title></head>
    <body>
    <nav>Navigation links</nav>
    <article>
    <h1>The Real Article</h1>
    <p>This is the main article body that should be extracted by readability.</p>
    <p>Multiple paragraphs prove the extractor works on the full content.</p>
    </article>
    <footer>Site footer noise</footer>
    </body>
    </html>
    HTML;
    $stub = Http::response($document, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $stub]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertNotNull($fetched->extractedText);
    $this->assertStringContainsString('main article body', $fetched->extractedText);
}
/**
 * Only two links from the fixture are expected to survive: the external
 * article link and the relative link resolved against the page URL. The
 * private-IP, credential-bearing and non-HTTP links must be absent.
 */
public function test_success_extracts_and_filters_outbound_links(): void
{
    $document = <<<'HTML'
    <!DOCTYPE html>
    <html>
    <head><title>Article With Links</title></head>
    <body>
    <nav>
    <a href="/home">Home (nav, should be filtered out by Readability scope)</a>
    </nav>
    <article>
    <h1>Article Title</h1>
    <p>This article references <a href="https://other.com/article">an external article</a>.</p>
    <p>And a <a href="/related-post">relative link to a related post</a> on the same site.</p>
    <p>Plus a <a href="http://192.168.1.1/admin">private IP link</a> that should be rejected.</p>
    <p>And a <a href="https://user:pass@evil.com/">credentials URL</a> that should be rejected.</p>
    <p>And a <a href="ftp://files.example.com/">non-http scheme</a> that should be rejected.</p>
    </article>
    <footer>
    <a href="/privacy">Privacy (footer, filtered by Readability scope)</a>
    </footer>
    </body>
    </html>
    HTML;
    $stub = Http::response($document, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $stub]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertInstanceOf(Collection::class, $fetched->outboundLinks);
    $this->assertCount(2, $fetched->outboundLinks);

    $links = $fetched->outboundLinks->all();

    // Kept: the absolute external link and the relative link made absolute.
    $this->assertContains('https://other.com/article', $links);
    $this->assertContains('https://example.com/related-post', $links);

    // Dropped: private IP, embedded credentials, non-HTTP scheme.
    $this->assertNotContains('http://192.168.1.1/admin', $links);
    $this->assertNotContains('https://user:pass@evil.com/', $links);
    $this->assertNotContains('ftp://files.example.com/', $links);
}
/**
 * The word count on a successful result must match the nine-word article
 * paragraph in the fixture.
 */
public function test_success_calculates_word_count(): void
{
    $document = <<<'HTML'
    <!DOCTYPE html>
    <html>
    <head><title>Word Count Test</title></head>
    <body>
    <article>
    <p>This article body has exactly nine words total here.</p>
    </article>
    </body>
    </html>
    HTML;
    $stub = Http::response($document, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $stub]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame(9, $fetched->wordCount);
}
/**
 * A mixed-case media type (`Text/HTML`) in the Content-Type header must still
 * be accepted as HTML.
 */
public function test_uppercase_content_type_is_accepted_as_html(): void
{
    $markup = '<html><head><title>Uppercase CT</title></head><body><p>Content here.</p></body></html>';
    $headers = ['Content-Type' => 'Text/HTML; charset=utf-8'];

    Http::fake([
        'example.com/*' => Http::response($markup, 200, $headers),
    ]);

    $fetched = $this->makeAction()('https://example.com/page');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
}
/**
 * An anchor with an empty href must not produce an outbound link.
 */
public function test_empty_href_is_filtered_from_outbound_links(): void
{
    $document = <<<'HTML'
    <!DOCTYPE html>
    <html>
    <head><title>Empty Href Test</title></head>
    <body>
    <article>
    <p>This paragraph has <a href="">an empty href anchor</a> that should be dropped.</p>
    </article>
    </body>
    </html>
    HTML;
    $stub = Http::response($document, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $stub]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertCount(0, $fetched->outboundLinks);
}
/**
 * An anchor whose href is only a fragment (`#section-2`) must not produce an
 * outbound link.
 */
public function test_fragment_only_href_is_filtered_from_outbound_links(): void
{
    $document = <<<'HTML'
    <!DOCTYPE html>
    <html>
    <head><title>Fragment Href Test</title></head>
    <body>
    <article>
    <p>Jump to <a href="#section-2">section 2</a> of this page.</p>
    </article>
    </body>
    </html>
    HTML;
    $stub = Http::response($document, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $stub]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertCount(0, $fetched->outboundLinks);
}
/**
 * With enough body text the language detection service must be consulted
 * exactly once, and its [language, confidence] pair must appear on the result.
 */
public function test_sufficient_text_triggers_language_detection_and_result_propagates(): void
{
    // The article paragraph is 27 words long, which this test relies on being
    // above the detection threshold.
    $document = <<<'HTML'
    <!DOCTYPE html>
    <html>
    <head><title>Language Detection Test</title></head>
    <body>
    <article>
    <p>The quick brown fox jumps over the lazy dog and then runs away into the forest
    where many other animals live and play together every single day.</p>
    </article>
    </body>
    </html>
    HTML;
    Http::fake([
        'example.com/*' => Http::response($document, 200, ['Content-Type' => 'text/html']),
    ]);

    // The mock must be bound before the action is resolved from the container.
    $detector = $this->mock(LanguageDetectionService::class);
    $detector->shouldReceive('detect')
        ->once()
        ->andReturn(['en', 0.95]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame('en', $fetched->language);
    $this->assertSame(0.95, $fetched->languageConfidence);
}
/**
 * With a short body the detection service must not be called; the <html lang>
 * attribute is used verbatim and reported with confidence 1.0.
 */
public function test_short_body_with_html_lang_attr_skips_service_and_uses_lang_attr(): void
{
    // The article paragraph is 6 words long, which this test relies on being
    // below the detection threshold.
    $document = <<<'HTML'
    <!DOCTYPE html>
    <html lang="pt-BR">
    <head><title>Short Page</title></head>
    <body>
    <article>
    <p>Too short to detect language automatically.</p>
    </article>
    </body>
    </html>
    HTML;
    Http::fake([
        'example.com/*' => Http::response($document, 200, ['Content-Type' => 'text/html']),
    ]);

    // The mock must be bound before the action is resolved from the container.
    $detector = $this->mock(LanguageDetectionService::class);
    $detector->shouldReceive('detect')->never();

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame('pt-BR', $fetched->language);
    $this->assertSame(1.0, $fetched->languageConfidence);
}
/**
 * With a short body and no <html lang> attribute, the detection service must
 * not be called and both language fields must be null.
 */
public function test_short_body_with_no_lang_attr_returns_null_language(): void
{
    // The article paragraph is 6 words long, which this test relies on being
    // below the detection threshold.
    $document = <<<'HTML'
    <!DOCTYPE html>
    <html>
    <head><title>Short Page</title></head>
    <body>
    <article>
    <p>Too short to detect language automatically.</p>
    </article>
    </body>
    </html>
    HTML;
    Http::fake([
        'example.com/*' => Http::response($document, 200, ['Content-Type' => 'text/html']),
    ]);

    // The mock must be bound before the action is resolved from the container.
    $detector = $this->mock(LanguageDetectionService::class);
    $detector->shouldReceive('detect')->never();

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertNull($fetched->language);
    $this->assertNull($fetched->languageConfidence);
}
/**
 * A whitespace-only <html lang> attribute must be treated as if it were
 * missing: no detection call and null language fields.
 */
public function test_whitespace_only_lang_attr_is_treated_as_absent(): void
{
    // The article paragraph is 6 words long, which this test relies on being
    // below the detection threshold. The lang attribute is a single space.
    $document = <<<'HTML'
    <!DOCTYPE html>
    <html lang=" ">
    <head><title>Short Page</title></head>
    <body>
    <article>
    <p>Too short to detect language automatically.</p>
    </article>
    </body>
    </html>
    HTML;
    Http::fake([
        'example.com/*' => Http::response($document, 200, ['Content-Type' => 'text/html']),
    ]);

    // The mock must be bound before the action is resolved from the container.
    $detector = $this->mock(LanguageDetectionService::class);
    $detector->shouldReceive('detect')->never();

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertNull($fetched->language);
    $this->assertNull($fetched->languageConfidence);
}
/**
 * An <html lang> value longer than 35 characters must be discarded, leaving
 * both language fields null.
 */
public function test_lang_attr_longer_than_35_chars_is_rejected(): void
{
    // The article paragraph is 6 words long, which this test relies on being
    // below the detection threshold. The lang attribute exceeds the BCP-47
    // column width (string(35)).
    $document = <<<'HTML'
    <!DOCTYPE html>
    <html lang="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-bbb">
    <head><title>Short Page</title></head>
    <body>
    <article>
    <p>Too short to detect language automatically.</p>
    </article>
    </body>
    </html>
    HTML;
    Http::fake([
        'example.com/*' => Http::response($document, 200, ['Content-Type' => 'text/html']),
    ]);

    // The mock must be bound before the action is resolved from the container.
    $detector = $this->mock(LanguageDetectionService::class);
    $detector->shouldReceive('detect')->never();

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertNull($fetched->language);
    $this->assertNull($fetched->languageConfidence);
}
/**
 * When the detection service returns a low-confidence result (0.15 here), the
 * <html lang> attribute must be used instead and reported with confidence 1.0.
 */
public function test_low_confidence_detection_falls_through_to_lang_attr(): void
{
    // The article paragraph is 27 words long, which this test relies on being
    // above the detection threshold, so the service is consulted.
    $document = <<<'HTML'
    <!DOCTYPE html>
    <html lang="en-US">
    <head><title>Confidence Floor Test</title></head>
    <body>
    <article>
    <p>The quick brown fox jumps over the lazy dog and then runs away into the forest
    where many other animals live and play together every single day.</p>
    </article>
    </body>
    </html>
    HTML;
    Http::fake([
        'example.com/*' => Http::response($document, 200, ['Content-Type' => 'text/html']),
    ]);

    // The mock must be bound before the action is resolved from the container.
    $detector = $this->mock(LanguageDetectionService::class);
    $detector->shouldReceive('detect')
        ->once()
        ->andReturn(['xx', 0.15]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame('en-US', $fetched->language);
    $this->assertSame(1.0, $fetched->languageConfidence);
}
/**
 * Resolve the action under test from the container, so that any mocks bound
 * by the calling test are injected.
 */
private function makeAction(): FetchPageAction
{
    $action = app(FetchPageAction::class);

    return $action;
}
}

View file

@ -0,0 +1,133 @@
<?php
declare(strict_types=1);
namespace Tests\Feature\Admin;
use App\Enums\PageStatusEnum;
use App\Models\Page;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Feature tests for the `/admin/instances` page: reachability, and the
 * per-instance URL, last-polled date, error count and URL count columns.
 */
class InstancesAdminPageTest extends TestCase
{
    use RefreshDatabase;

    /**
     * The page responds with HTTP 200.
     */
    public function test_admin_instances_page_is_accessible(): void
    {
        $this->get('/admin/instances')->assertStatus(200);
    }

    /**
     * Each instance's URL and last-polled date (Y-m-d) appear on the page.
     */
    public function test_admin_instances_page_shows_each_instance_url_and_last_polled_at(): void
    {
        // The two instances are polled on different days so that each date
        // assertion can only be satisfied by its own row.
        $mastodon = Instance::factory()
            ->type(InstanceType::Mastodon)
            ->enabled()
            ->create([
                'url' => 'https://mastodon.social',
                'last_polled_at' => '2024-06-01 12:00:00',
            ]);
        $lemmy = Instance::factory()
            ->type(InstanceType::Lemmy)
            ->enabled()
            ->create([
                'url' => 'https://lemmy.world',
                'last_polled_at' => '2024-06-02 13:00:00',
            ]);

        $response = $this->get('/admin/instances');

        foreach ([$mastodon, $lemmy] as $instance) {
            $response->assertSee($instance->url);
            $response->assertSee($instance->last_polled_at->toDateString());
        }
    }

    /**
     * Each row shows the number of Failed pages belonging to that instance.
     */
    public function test_admin_instances_page_shows_error_count_per_instance(): void
    {
        $first = Instance::factory()
            ->type(InstanceType::Mastodon)
            ->enabled()
            ->create(['url' => 'https://aardvark.example']);
        $second = Instance::factory()
            ->type(InstanceType::Lemmy)
            ->enabled()
            ->create(['url' => 'https://zebra.example']);

        // First instance: 3 failed + 2 non-failed pages
        Page::factory()
            ->count(3)
            ->sequence(fn ($s) => ['url' => "https://aardvark.example/fail-{$s->index}"])
            ->createQuietly(['instance_id' => $first->id, 'status' => PageStatusEnum::Failed]);
        Page::factory()
            ->count(2)
            ->sequence(fn ($s) => ['url' => "https://aardvark.example/ok-{$s->index}"])
            ->createQuietly(['instance_id' => $first->id, 'status' => PageStatusEnum::Fetched]);

        // Second instance: 1 failed + 4 non-failed pages
        Page::factory()
            ->count(1)
            ->sequence(fn ($s) => ['url' => "https://zebra.example/fail-{$s->index}"])
            ->createQuietly(['instance_id' => $second->id, 'status' => PageStatusEnum::Failed]);
        Page::factory()
            ->count(4)
            ->sequence(fn ($s) => ['url' => "https://zebra.example/ok-{$s->index}"])
            ->createQuietly(['instance_id' => $second->id, 'status' => PageStatusEnum::Fetched]);

        $response = $this->get('/admin/instances');

        // Each error-count cell must render as "{n} errors" — this string cannot
        // collide with dates, IDs, or the "URLs" column. The counts (3 and 1)
        // are distinct and non-equal so the assertion proves per-row mapping,
        // not a leaked total.
        $response->assertSeeInOrder([
            $first->url,
            '3 errors',
            $second->url,
            '1 errors',
        ]);
    }

    /**
     * Each row shows the total number of pages belonging to that instance.
     */
    public function test_admin_instances_page_shows_url_count_per_instance(): void
    {
        $first = Instance::factory()
            ->type(InstanceType::Mastodon)
            ->enabled()
            ->create(['url' => 'https://aardvark.example']);
        $second = Instance::factory()
            ->type(InstanceType::Lemmy)
            ->enabled()
            ->create(['url' => 'https://zebra.example']);

        Page::factory()
            ->count(7)
            ->sequence(fn ($s) => ['url' => "https://aardvark.example/page-{$s->index}"])
            ->createQuietly(['instance_id' => $first->id]);
        Page::factory()
            ->count(2)
            ->sequence(fn ($s) => ['url' => "https://zebra.example/page-{$s->index}"])
            ->createQuietly(['instance_id' => $second->id]);

        $response = $this->get('/admin/instances');

        // Each count cell must render as "{n} URLs" — this string cannot
        // collide with dates, IDs, or any other incidental numeric content,
        // so the assertion only passes when a real count column is wired in.
        $response->assertSeeInOrder([
            $first->url,
            '7 URLs',
            $second->url,
            '2 URLs',
        ]);
    }
}

View file

@ -0,0 +1,39 @@
<?php
declare(strict_types=1);
namespace Tests\Feature;
use Tests\TestCase;
/**
 * Feature tests for the public `/bot` page, which describes the crawler to
 * site operators.
 */
class BotPageTest extends TestCase
{
    /**
     * The page is reachable without authentication.
     */
    public function test_bot_page_renders_at_public_route(): void
    {
        $this->get('/bot')->assertStatus(200);
    }

    /**
     * The page shows the crawler's full User-Agent string.
     */
    public function test_bot_page_contains_user_agent_string(): void
    {
        $page = $this->get('/bot');

        $page->assertSee('TroveBot/0.1 (+https://trove.lvl0.xyz/bot)', escape: false);
    }

    /**
     * The page shows a robots.txt snippet for opting out of crawling.
     */
    public function test_bot_page_contains_robots_txt_opt_out_example(): void
    {
        $page = $this->get('/bot');

        foreach (['User-agent: TroveBot', 'Disallow: /'] as $robotsLine) {
            $page->assertSee($robotsLine, escape: false);
        }
    }

    /**
     * The page contains the URL of the project's forge repository.
     */
    public function test_bot_page_links_to_forge_repository(): void
    {
        $page = $this->get('/bot');

        $page->assertSee('https://forge.lvl0.xyz/lvl0/trove', escape: false);
    }
}

View file

@ -1,19 +0,0 @@
<?php
namespace Tests\Feature;
// use Illuminate\Foundation\Testing\RefreshDatabase;
use Tests\TestCase;
class ExampleTest extends TestCase
{
    /**
     * Smoke test: a GET request to the root route answers with HTTP 200.
     */
    public function test_the_application_returns_a_successful_response(): void
    {
        $this->get('/')->assertStatus(200);
    }
}

View file

@ -0,0 +1,573 @@
<?php
declare(strict_types=1);
namespace Tests\Feature\Jobs;
use App\Actions\FetchPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use App\Jobs\ProcessCrawlJob;
use App\Models\Page;
use App\Models\PageCrawl;
use App\ValueObjects\FetchResult;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Facades\Queue;
use Mockery;
use Tests\TestCase;
class ProcessCrawlJobTest extends TestCase
{
use RefreshDatabase;
/**
 * Creating a PageCrawl with model events enabled must push a ProcessCrawlJob
 * onto the queue.
 */
public function test_creating_a_page_crawl_dispatches_process_crawl_job(): void
{
    Queue::fake();

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);

    // create() rather than createQuietly(): model events must fire here.
    PageCrawl::factory()
        ->page($page)
        ->create();

    Queue::assertPushed(ProcessCrawlJob::class);
}
/**
 * The job pushed when a PageCrawl is created must reference that same
 * PageCrawl row.
 */
public function test_dispatched_job_carries_the_correct_page_crawl(): void
{
    Queue::fake();

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->create();

    $carriesThisCrawl = fn (ProcessCrawlJob $job) => $job->pageCrawl->id === $pageCrawl->id;

    Queue::assertPushed(ProcessCrawlJob::class, $carriesThisCrawl);
}
/**
 * A successful fetch must be recorded on the PageCrawl row: Success outcome,
 * a completion timestamp, the HTTP status, and no error message.
 */
public function test_handle_writes_outcome_to_page_crawl_on_success(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        title: 'Hello',
        extractedText: 'hi',
        wordCount: 1,
    );

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $pageCrawl->fresh();
    $this->assertSame(CrawlOutcomeEnum::Success, $reloaded->outcome);
    $this->assertNotNull($reloaded->completed_at);
    $this->assertInstanceOf(Carbon::class, $reloaded->completed_at);
    $this->assertSame(200, $reloaded->status_code);
    $this->assertNull($reloaded->error_message);
}
/**
 * A successful fetch must move the Page to Fetched, stamp fetched_at and
 * store the fetched title.
 */
public function test_handle_updates_page_to_fetched_on_success(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        title: 'Hello',
        extractedText: 'hi',
        wordCount: 1,
    );

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $page->fresh();
    $this->assertSame(PageStatusEnum::Fetched, $reloaded->status);
    $this->assertNotNull($reloaded->fetched_at);
    $this->assertInstanceOf(Carbon::class, $reloaded->fetched_at);
    $this->assertSame('Hello', $reloaded->title);
}
/**
 * A Rejected fetch must move the Page to Rejected and leave fetched_at unset.
 */
public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Rejected,
        statusCode: 200,
        errorMessage: 'Unsupported Content-Type: application/pdf',
    );

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/brochure.pdf']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $page->fresh();
    $this->assertSame(PageStatusEnum::Rejected, $reloaded->status);
    $this->assertNull($reloaded->fetched_at);
}
/**
 * A Blocked4xx fetch (HTTP 404 here) must move the Page to Failed and stamp
 * failed_at.
 */
public function test_handle_updates_page_to_failed_on_blocked_4xx(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Blocked4xx,
        statusCode: 404,
        errorMessage: 'HTTP 404',
    );

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/gone']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $page->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloaded->status);
    $this->assertNotNull($reloaded->failed_at);
    $this->assertInstanceOf(Carbon::class, $reloaded->failed_at);
}
/**
 * A Timeout fetch must move the Page to Failed and stamp failed_at.
 */
public function test_handle_updates_page_to_failed_on_timeout(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Timeout,
        errorMessage: 'Connection timed out after 10 seconds',
    );

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/slow']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $page->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloaded->status);
    $this->assertNotNull($reloaded->failed_at);
    $this->assertInstanceOf(Carbon::class, $reloaded->failed_at);
}
/**
 * A Failed fetch on a first attempt must create a second, still-pending
 * PageCrawl row for the same page and push a ProcessCrawlJob for that row.
 */
public function test_handle_schedules_retry_on_transient_failure(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    // The original attempt plus exactly one retry row.
    $attempts = PageCrawl::where('page_id', $page->id)->count();
    $this->assertSame(2, $attempts);

    // The retry row is pending, i.e. its outcome is still NULL.
    $pendingRetry = PageCrawl::where('page_id', $page->id)
        ->whereNull('outcome')
        ->first();
    $this->assertNotNull($pendingRetry);

    // A job must have been pushed for the retry row. Only the job's target is
    // checked here, not any delay it was dispatched with.
    $targetsRetry = fn (ProcessCrawlJob $job) => $job->pageCrawl->page_id === $page->id
        && $job->pageCrawl->id === $pendingRetry->id;
    Queue::assertPushed(ProcessCrawlJob::class, $targetsRetry);
}
/**
 * When the failing crawl is the third attempt for a page, no further retry
 * row is created and no retry job is pushed.
 */
public function test_handle_does_not_retry_after_three_attempts(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);

    // Two failed attempts already exist; the crawl processed below is the
    // third, which is the retry cap this test expects.
    PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly();
    PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly();
    $thirdCrawl = PageCrawl::factory()->page($page)->createQuietly();

    app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl])
        ->handle();

    // No 4th row must appear — retry cap reached
    $this->assertSame(3, PageCrawl::where('page_id', $page->id)->count());

    // No retry job dispatched
    Queue::assertNotPushed(ProcessCrawlJob::class);
}
/**
 * A Failed fetch must be written to the PageCrawl row with a null status
 * code and the fetcher's error message.
 */
public function test_handle_writes_failed_outcome_to_page_crawl(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'boom');

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $expectedRow = [
        'id' => $pageCrawl->id,
        'outcome' => CrawlOutcomeEnum::Failed->value,
        'status_code' => null,
        'error_message' => 'boom',
    ];
    $this->assertDatabaseHas('page_crawls', $expectedRow);
}
/**
 * A Failed fetch must move the Page to Failed.
 */
public function test_handle_updates_page_to_failed_on_failed_outcome(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $page->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloaded->status);
}
/**
 * A Blocked5xx fetch (HTTP 503 here) must move the Page to Failed.
 */
public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Blocked5xx,
        statusCode: 503,
        errorMessage: 'HTTP 503',
    );

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $page->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloaded->status);
}
/**
 * A BlockedRobots result returned by the fetcher must move the Page to Failed.
 */
public function test_handle_updates_page_to_failed_on_blocked_robots(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::BlockedRobots,
        errorMessage: 'Disallowed by robots.txt',
    );

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $page->fresh();
    $this->assertSame(PageStatusEnum::Failed, $reloaded->status);
}
/**
 * Outbound links carried on a Failed result must not be registered as pages.
 */
public function test_handle_does_not_register_outbound_links_on_failure(): void
{
    Queue::fake();

    $strayLink = 'https://should-not-be-registered.com/page';
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Failed,
        outboundLinks: collect([$strayLink]),
        errorMessage: 'Connection refused',
    );

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/broken']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $this->assertDatabaseMissing('pages', ['url' => $strayLink]);
    // Only the source page itself exists.
    $this->assertSame(1, Page::count());
}
/**
 * Every outbound link on a Success result must be registered as its own
 * `pages` row.
 */
public function test_handle_registers_outbound_links_on_success(): void
{
    Queue::fake();

    $discovered = ['https://other.com/article-1', 'https://another.com/post-2'];
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://source.com/article',
        title: 'Source Article',
        extractedText: 'some text',
        outboundLinks: collect($discovered),
        wordCount: 2,
    );

    $page = Page::factory()->createQuietly(['url' => 'https://source.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    foreach ($discovered as $link) {
        $this->assertDatabaseHas('pages', ['url' => $link]);
    }
    // The source page plus the two discovered links.
    $this->assertSame(3, Page::count());
}
/**
 * When the domain lock is already held, handle() must return without calling
 * the fetcher and without touching the crawl row or the page status.
 *
 * NOTE(review): the test name mentions releasing the job, but only the early
 * return is asserted here. Confirm whether release should be checked too.
 */
public function test_handle_releases_job_when_domain_is_locked(): void
{
    Queue::fake();

    // Take the lock up front so the job finds it already held.
    Cache::lock('crawler:domain:example.com', 10)->get();

    // The job is expected to bail out before it ever reaches the fetcher.
    $untouchedFetcher = Mockery::mock(FetchPageAction::class);
    $untouchedFetcher->shouldNotReceive('__invoke');
    $this->app->instance(FetchPageAction::class, $untouchedFetcher);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    (new ProcessCrawlJob($pageCrawl))->handle();

    // No outcome was written because handle() returned early.
    $this->assertNull($pageCrawl->fresh()->outcome);

    // The page keeps its factory-default status (Discovered).
    $this->assertSame(PageStatusEnum::Discovered, $page->fresh()->status);
}
/**
 * The domain lock taken inside handle() must still be held once handle()
 * has returned.
 */
public function test_handle_does_not_release_lock_after_completion(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    (new ProcessCrawlJob($pageCrawl))->handle();

    // Had handle() released the lock, this acquisition would succeed (true).
    // It has to fail (false), which shows the lock is still held.
    $acquired = Cache::lock('crawler:domain:example.com', 10)->get();

    $this->assertFalse($acquired, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.');
}
/**
 * When robots.txt disallows everything, handle() must record BlockedRobots,
 * mark the page Failed, never call the fetcher and never take the domain lock.
 */
public function test_handle_writes_blocked_robots_when_disallowed(): void
{
    Queue::fake();

    $denyAll = Http::response("User-agent: *\nDisallow: /", 200);
    Http::fake(['https://example.com/robots.txt' => $denyAll]);

    // The fetcher must never run: the robots gate returns before the lock.
    $untouchedFetcher = Mockery::mock(FetchPageAction::class);
    $untouchedFetcher->shouldNotReceive('__invoke');
    $this->app->instance(FetchPageAction::class, $untouchedFetcher);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();
    $domain = $pageCrawl->domain;

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    // The crawl row records the BlockedRobots outcome.
    $this->assertDatabaseHas('page_crawls', [
        'id' => $pageCrawl->id,
        'outcome' => CrawlOutcomeEnum::BlockedRobots->value,
    ]);

    // The page ends up Failed.
    $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status);

    // The politeness lock can still be acquired, so the job never claimed it.
    $lockIsFree = Cache::lock("crawler:domain:{$domain}", 10)->get();
    $this->assertTrue(
        $lockIsFree,
        'Expected the politeness lock to be free (gate returned before acquiring it), but it was already held.',
    );
}
/**
 * After a successful handle() the per-domain lock must be held and the
 * Success outcome must have been written.
 */
public function test_handle_acquires_domain_lock_before_fetching(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);

    $page = Page::factory()->createQuietly(['url' => 'https://lock-test.example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();
    $domain = $pageCrawl->domain;

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    // A second acquisition attempt fails because handle() still holds the lock.
    $acquired = Cache::lock("crawler:domain:{$domain}", 10)->get();
    $this->assertFalse(
        $acquired,
        'Expected the domain lock to still be held after handle() ran, but it was free.',
    );

    // The outcome was written, so the lock did not stop the fetch from running.
    $this->assertSame(CrawlOutcomeEnum::Success, $pageCrawl->fresh()->outcome);
}
/**
 * When robots.txt allows the URL, the fetcher must run exactly once, the
 * Success outcome must be persisted, the page must become Fetched and the
 * politeness lock must remain held.
 */
public function test_handle_proceeds_through_politeness_lock_when_robots_allow(): void
{
    Queue::fake();

    $allowAll = Http::response("User-agent: *\nAllow: /", 200);
    Http::fake(['https://example.com/robots.txt' => $allowAll]);

    $cannedResult = new FetchResult(
        outcome: CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://example.com/article',
        title: 'Hello',
        extractedText: 'hi',
        outboundLinks: collect(),
        wordCount: 1,
        errorMessage: null,
    );

    // Exactly one fetch is expected once the robots gate has passed.
    $fetcher = Mockery::mock(FetchPageAction::class);
    $fetcher->shouldReceive('__invoke')->once()->andReturn($cannedResult);
    $this->app->instance(FetchPageAction::class, $fetcher);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();
    $domain = $pageCrawl->domain;

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    // The recorded outcome is Success rather than BlockedRobots.
    $this->assertDatabaseHas('page_crawls', [
        'id' => $pageCrawl->id,
        'outcome' => CrawlOutcomeEnum::Success->value,
    ]);

    // The page advanced to Fetched.
    $this->assertSame(PageStatusEnum::Fetched, $page->fresh()->status);

    // The lock claimed for the fetch is still held afterwards.
    $acquired = Cache::lock("crawler:domain:{$domain}", 10)->get();
    $this->assertFalse(
        $acquired,
        'Expected the politeness lock to be held after a successful fetch, but it was free.',
    );
}
/**
 * The language and confidence on a Success result must be stored on the Page.
 */
public function test_handle_persists_language_on_success(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        title: 'Hello',
        extractedText: 'hi',
        wordCount: 1,
        language: 'en',
        languageConfidence: 0.95,
    );

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $page->fresh();
    $this->assertSame('en', $reloaded->language);
    // The confidence is compared with a tolerance rather than exactly.
    $this->assertEqualsWithDelta(0.95, $reloaded->language_confidence, 0.001);
}
/**
 * A Success result without language data must leave the Page's existing
 * language columns untouched while still updating status and title.
 */
public function test_handle_does_not_overwrite_existing_language_when_new_fetch_returns_null(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        title: 'Hello',
        extractedText: 'hi',
        wordCount: 1,
        language: null,
        languageConfidence: null,
    );

    // The page starts out with language data already stored.
    $existing = [
        'url' => 'https://example.com/article',
        'language' => 'en',
        'language_confidence' => 0.95,
    ];
    $page = Page::factory()->createQuietly($existing);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $page->fresh();

    // The language columns are sticky: a null detection must not clear them.
    $this->assertSame('en', $reloaded->language);
    $this->assertEqualsWithDelta(0.95, $reloaded->language_confidence, 0.001);

    // Stickiness applies to language only; the other columns still update.
    $this->assertSame(PageStatusEnum::Fetched, $reloaded->status);
    $this->assertSame('Hello', $reloaded->title);
}
/**
 * With no language on the result and no language set on the page at creation,
 * both language columns must be null after a successful fetch.
 */
public function test_handle_leaves_language_null_when_no_prior_and_no_detection(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        title: 'Hello',
        extractedText: 'hi',
        wordCount: 1,
        language: null,
        languageConfidence: null,
    );

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $reloaded = $page->fresh();
    $this->assertNull($reloaded->language);
    $this->assertNull($reloaded->language_confidence);
}
/**
 * Replace the container binding for FetchPageAction with a Mockery double
 * whose invocation returns a FetchResult assembled from the given arguments.
 *
 * A null $outboundLinks is replaced with an empty collection; every other
 * argument is handed to FetchResult unchanged.
 */
private function mockFetchPageAction(
    CrawlOutcomeEnum $outcome,
    ?int $statusCode = null,
    ?string $finalUrl = 'https://example.com/article',
    ?string $title = null,
    ?string $extractedText = null,
    ?Collection $outboundLinks = null,
    ?int $wordCount = null,
    ?string $errorMessage = null,
    ?string $language = null,
    ?float $languageConfidence = null,
): void {
    $cannedResult = new FetchResult(
        outcome: $outcome,
        statusCode: $statusCode,
        finalUrl: $finalUrl,
        title: $title,
        extractedText: $extractedText,
        outboundLinks: $outboundLinks ?? collect(),
        wordCount: $wordCount,
        errorMessage: $errorMessage,
        language: $language,
        languageConfidence: $languageConfidence,
    );

    $double = Mockery::mock(FetchPageAction::class);
    $double->shouldReceive('__invoke')->andReturn($cannedResult);

    // Anything resolving FetchPageAction from the container now gets the double.
    $this->app->instance(FetchPageAction::class, $double);
}
}

View file

@ -0,0 +1,52 @@
<?php
declare(strict_types=1);
namespace Tests\Feature\Listeners;
use App\Listeners\PollFailedListener;
use App\Services\PollAlertService;
use Carbon\CarbonImmutable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Events\PollFailed;
use Lvl0\FediDiscover\Models\Instance;
use Mockery;
use Tests\TestCase;
/**
 * Covers PollFailedListener: it forwards the failed instance and the error
 * message to PollAlertService, and it does not implement ShouldQueue.
 */
class PollFailedListenerTest extends TestCase
{
    use RefreshDatabase;

    /**
     * handle() must call recordFailure() exactly once, with the instance and
     * the message carried by the PollFailed event.
     */
    public function test_handle_calls_record_failure_with_the_event_instance_and_message(): void
    {
        $instance = Instance::factory()
            ->type(InstanceType::Mastodon)
            ->enabled()
            ->create(['consecutive_poll_failures' => 0]);
        $reason = 'connection timed out';
        $pollFailed = new PollFailed($instance, $reason, CarbonImmutable::now());

        $alerts = Mockery::mock(PollAlertService::class);
        $alerts->shouldReceive('recordFailure')
            ->once()
            ->with(
                // Match by model identity rather than object identity.
                Mockery::on(static fn (Instance $candidate): bool => $candidate->is($instance)),
                $reason,
            );

        (new PollFailedListener($alerts))->handle($pollFailed);
    }

    /**
     * The listener class must not implement the ShouldQueue contract.
     */
    public function test_listener_is_not_queued(): void
    {
        $listener = new PollFailedListener($this->createStub(PollAlertService::class));

        $this->assertNotInstanceOf(ShouldQueue::class, $listener);
    }
}

View file

@ -0,0 +1,70 @@
<?php
declare(strict_types=1);
namespace Tests\Feature;
use App\Models\Page;
use App\Models\PageCrawl;
use App\Services\UrlService;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Tests\TestCase;
/**
 * Verifies that creating a Page populates the page_crawls queue exactly once,
 * and that lookups, updates and invalid URLs never add extra crawl rows.
 */
class PageQueuePopulationTest extends TestCase
{
    use RefreshDatabase;

    /**
     * Creating a page inserts exactly one crawl row carrying the page id, the
     * host of the page URL as its domain, and priority 0.
     */
    public function test_creating_a_page_inserts_a_page_crawl_row(): void
    {
        $url = 'https://example-blog.com/article';
        $page = Page::factory()->create(['url' => $url]);
        $expectedDomain = (new UrlService)->host($url);

        $this->assertDatabaseHas('page_crawls', [
            'page_id' => $page->id,
            'domain' => $expectedDomain,
            'priority' => 0,
        ]);

        // assertDatabaseHas only proves "at least one"; pin the exact count too.
        $this->assertSame(1, PageCrawl::where('page_id', $page->id)->count());
    }

    /**
     * firstOrCreate() on an already stored URL must not add a second crawl row.
     */
    public function test_first_or_create_with_existing_url_does_not_insert_duplicate_crawl(): void
    {
        $url = 'https://example-blog.com/article';
        Page::factory()->create(['url' => $url]);

        // Finds the existing row — created event does not fire again
        Page::firstOrCreate(['url' => $url], ['status' => 'discovered']);

        $this->assertDatabaseCount('page_crawls', 1);
    }

    /**
     * Updating an existing page must not add another crawl row.
     */
    public function test_updating_a_page_does_not_insert_another_crawl(): void
    {
        $page = Page::factory()->create(['url' => 'https://example-blog.com/article']);

        $page->update(['title' => 'New Title']);

        $this->assertDatabaseCount('page_crawls', 1);
    }

    /**
     * Creating a page with an unparseable URL raises InvalidArgumentException;
     * the page row is still persisted but no crawl row is inserted.
     */
    public function test_bad_url_throws_exception_page_persists_no_crawl_inserted(): void
    {
        // assertThrows (rather than expectException) lets the database
        // assertions below still run after the exception has been raised.
        $this->assertThrows(
            fn () => Page::create(['url' => 'not-a-url', 'status' => 'discovered']),
            \InvalidArgumentException::class,
        );

        $this->assertDatabaseHas('pages', ['url' => 'not-a-url']);
        $this->assertDatabaseCount('page_crawls', 0);
    }
}

View file

@ -0,0 +1,37 @@
<?php
declare(strict_types=1);
namespace Tests\Feature;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Http;
use Lvl0\FediDiscover\Actions\PollFediverseAction;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use RuntimeException;
use Tests\TestCase;
/**
 * End-to-end check: when the poll action throws while the poll command runs,
 * the instance's consecutive_poll_failures counter ends up incremented.
 */
class PollFailedIntegrationTest extends TestCase
{
    use RefreshDatabase;

    /**
     * Runs the fedi-discover:poll command with a poll action that throws and
     * expects the stored failure counter to go from 0 to 1.
     */
    public function test_poll_failure_increments_consecutive_poll_failures_via_full_chain(): void
    {
        Http::fake();

        $instance = Instance::factory()
            ->type(InstanceType::Mastodon)
            ->enabled()
            ->create(['consecutive_poll_failures' => 0]);

        // Force the poll itself to fail.
        $poller = $this->mock(PollFediverseAction::class);
        $poller->shouldReceive('execute')
            ->once()
            ->andThrow(new RuntimeException('connection refused'));

        $this->artisan('fedi-discover:poll');

        $reloaded = $instance->fresh();
        $this->assertSame(1, $reloaded->consecutive_poll_failures);
    }
}

View file

@ -0,0 +1,171 @@
<?php
declare(strict_types=1);
namespace Tests\Feature\Services;
use App\Services\PollAlertService;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Http;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Covers PollAlertService::recordFailure(): the failure counter increment and
 * the ntfy alert that is sent once the configured threshold is reached.
 *
 * Every test fakes the HTTP client so that no test can reach a real ntfy
 * server, whatever the surrounding environment configuration is.
 */
class PollAlertServiceTest extends TestCase
{
    use RefreshDatabase;

    /**
     * Point the ntfy service configuration at the test server.
     *
     * @param  string|null  $topic  ntfy topic, or null to leave it unset
     * @param  int  $threshold  value stored under services.ntfy.threshold
     */
    private function configureNtfy(?string $topic = 'trove-alerts', int $threshold = 3): void
    {
        config([
            'services.ntfy.url' => 'https://ntfy.example.com',
            'services.ntfy.topic' => $topic,
            'services.ntfy.threshold' => $threshold,
        ]);
    }

    /**
     * Create an enabled Mastodon instance with the given stored failure count.
     *
     * @param  int  $failures  initial consecutive_poll_failures value
     * @param  array<string, mixed>  $attributes  extra attributes for the factory
     */
    private function instanceWithFailures(int $failures, array $attributes = []): Instance
    {
        return Instance::factory()
            ->type(InstanceType::Mastodon)
            ->enabled()
            ->create(['consecutive_poll_failures' => $failures] + $attributes);
    }

    /**
     * Assert that a POST was sent to the configured ntfy topic URL.
     */
    private function assertAlertPosted(): void
    {
        Http::assertSent(function ($request) {
            return $request->url() === 'https://ntfy.example.com/trove-alerts'
                && $request->method() === 'POST';
        });
    }

    /**
     * recordFailure() increments the stored counter by one.
     */
    public function test_record_failure_increments_consecutive_poll_failures_on_the_instance(): void
    {
        // Guard against real network traffic if the environment has ntfy configured.
        Http::fake();

        $instance = $this->instanceWithFailures(0);

        (new PollAlertService)->recordFailure($instance, 'test');

        $this->assertDatabaseHas('fedi_discover_instances', [
            'id' => $instance->id,
            'consecutive_poll_failures' => 1,
        ]);
    }

    /**
     * No request is sent while the counter stays below the threshold.
     */
    public function test_no_alert_sent_below_threshold(): void
    {
        Http::fake();
        $this->configureNtfy();

        $instance = $this->instanceWithFailures(1); // will become 2 after recordFailure

        (new PollAlertService)->recordFailure($instance, 'test');

        Http::assertNothingSent();
    }

    /**
     * An alert is posted when the counter lands exactly on the threshold.
     */
    public function test_alert_sent_when_threshold_is_reached(): void
    {
        Http::fake();
        $this->configureNtfy();

        $instance = $this->instanceWithFailures(2); // will become 3 after recordFailure = exactly at threshold

        (new PollAlertService)->recordFailure($instance, 'test');

        $this->assertAlertPosted();
    }

    /**
     * An alert is also posted when the counter is already past the threshold.
     */
    public function test_alert_sent_when_count_exceeds_threshold(): void
    {
        Http::fake();
        $this->configureNtfy();

        $instance = $this->instanceWithFailures(3); // will become 4 after recordFailure = above threshold

        (new PollAlertService)->recordFailure($instance, 'test');

        $this->assertAlertPosted();
    }

    /**
     * A threshold of zero sends nothing, however high the counter is.
     */
    public function test_no_alert_sent_when_threshold_is_zero(): void
    {
        Http::fake();
        $this->configureNtfy(threshold: 0);

        $instance = $this->instanceWithFailures(5);

        (new PollAlertService)->recordFailure($instance, 'test');

        Http::assertNothingSent();
    }

    /**
     * Without a configured topic nothing is sent, even at the threshold.
     */
    public function test_no_alert_sent_when_topic_is_null(): void
    {
        Http::fake();
        $this->configureNtfy(topic: null);

        $instance = $this->instanceWithFailures(2); // will become 3 after recordFailure = at threshold

        (new PollAlertService)->recordFailure($instance, 'test');

        Http::assertNothingSent();
    }

    /**
     * The alert body contains both the instance URL and the failure message.
     */
    public function test_alert_body_contains_instance_url_and_message(): void
    {
        Http::fake();
        $this->configureNtfy();

        // 2 stored failures: will become 3 = at threshold
        $instance = $this->instanceWithFailures(2, ['url' => 'https://mastodon.social']);

        (new PollAlertService)->recordFailure($instance, 'connection refused after 3 retries');

        Http::assertSent(function ($request) {
            return str_contains($request->body(), 'https://mastodon.social')
                && str_contains($request->body(), 'connection refused after 3 retries');
        });
    }
}

View file

@ -0,0 +1,155 @@
<?php
declare(strict_types=1);
namespace Tests\Feature;
use App\Listeners\UrlDiscoveredListener;
use App\Models\Page;
use App\Models\PageLink;
use Carbon\CarbonImmutable;
use Illuminate\Events\CallQueuedListener;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Events\UrlDiscovered;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Exercises the handling of the UrlDiscovered event: which pages and links
 * are created, repeat-dispatch behaviour, the null-postUrl case, crawl rows
 * created for new pages, and queueing of the listener.
 */
class UrlDiscoveryTest extends TestCase
{
    use RefreshDatabase;

    /**
     * Create an enabled Mastodon instance to act as the discovery source.
     */
    private function makeInstance(): Instance
    {
        return Instance::factory()
            ->type(InstanceType::Mastodon)
            ->enabled()
            ->create();
    }

    /**
     * Build a UrlDiscovered event with sensible defaults.
     *
     * 'url', 'instanceId' and 'discoveredAt' fall back to their defaults when
     * the override is missing or null. 'postUrl' and 'postBody' fall back only
     * when the key is absent, so an explicit null is passed through.
     *
     * @param  array<string, mixed>  $overrides
     */
    private function makeEvent(Instance $instance, array $overrides = []): UrlDiscovered
    {
        $postUrl = array_key_exists('postUrl', $overrides)
            ? $overrides['postUrl']
            : 'https://mastodon.social/@alice/109876543210';
        $postBody = array_key_exists('postBody', $overrides)
            ? $overrides['postBody']
            : 'check this out https://example-blog.com/article';

        return new UrlDiscovered(
            url: $overrides['url'] ?? 'https://example-blog.com/article',
            instanceId: $overrides['instanceId'] ?? $instance->id,
            discoveredAt: $overrides['discoveredAt'] ?? CarbonImmutable::parse('2026-04-26T12:00:00Z'),
            postUrl: $postUrl,
            postBody: $postBody,
        );
    }

    /**
     * Happy path: the event yields a target page, a source page for the post,
     * and a link from the source page to the target page.
     */
    public function test_listener_creates_target_page_and_source_page_with_link(): void
    {
        event($this->makeEvent($this->makeInstance()));

        $target = Page::where('url', 'https://example-blog.com/article')->first();
        $this->assertNotNull($target);

        $source = Page::where('url', 'https://mastodon.social/@alice/109876543210')->first();
        $this->assertNotNull($source);

        $edge = PageLink::where('source_page_id', $source->id)
            ->where('target_page_id', $target->id)
            ->first();
        $this->assertNotNull($edge);
    }

    /**
     * Idempotency: dispatching the same event twice leaves two pages and a
     * single link.
     */
    public function test_listener_is_idempotent_on_repeated_event(): void
    {
        $event = $this->makeEvent($this->makeInstance());

        for ($dispatch = 0; $dispatch < 2; $dispatch++) {
            event($event);
        }

        $this->assertSame(2, Page::count());
        $this->assertSame(1, PageLink::count());
    }

    /**
     * Null postUrl: only the target page is created and no link exists.
     */
    public function test_listener_with_null_post_url_creates_only_target_page(): void
    {
        $withoutPost = ['postUrl' => null, 'postBody' => null];

        event($this->makeEvent($this->makeInstance(), $withoutPost));

        $this->assertSame(1, Page::count());
        $this->assertSame(0, PageLink::count());
        $this->assertNotNull(Page::where('url', 'https://example-blog.com/article')->first());
    }

    /**
     * Integration: the event produces one crawl row per created page, one for
     * each of the two domains involved.
     */
    public function test_url_discovered_event_enqueues_crawls_via_observer(): void
    {
        event($this->makeEvent($this->makeInstance()));

        // Listener creates 2 pages (target + source); observer fires for each → 2 crawl rows
        $this->assertDatabaseCount('page_crawls', 2);
        foreach (['example-blog.com', 'mastodon.social'] as $domain) {
            $this->assertDatabaseHas('page_crawls', ['domain' => $domain]);
        }
    }

    /**
     * With the queue faked, dispatching the event pushes a CallQueuedListener
     * job for UrlDiscoveredListener.
     */
    public function test_listener_is_pushed_to_queue_not_run_inline(): void
    {
        Queue::fake();

        event($this->makeEvent($this->makeInstance()));

        Queue::assertPushed(
            CallQueuedListener::class,
            fn (CallQueuedListener $job): bool => $job->class === UrlDiscoveredListener::class,
        );
    }
}

View file

@ -0,0 +1,158 @@
<?php
declare(strict_types=1);
namespace Tests\Feature;
use App\Enums\PageStatusEnum;
use App\Livewire\UrlSubmissionForm;
use App\Models\Page;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Livewire\Livewire;
use PHPUnit\Framework\Attributes\DataProvider;
use Tests\TestCase;
/**
 * Feature tests for the public URL submission form (UrlSubmissionForm):
 * rendering, persistence, idempotency, confirmation state, validation, crawl
 * enqueueing and rate limiting.
 */
class UrlSubmissionTest extends TestCase
{
    use RefreshDatabase;

    /**
     * The /submit route responds with 200 and renders the Livewire form.
     */
    public function test_submission_form_renders_at_public_route(): void
    {
        $response = $this->get('/submit');

        $response->assertStatus(200);
        $response->assertSeeLivewire('url-submission-form');
    }

    /**
     * A valid submission stores the page, and stores it in Discovered status.
     */
    public function test_valid_url_submission_creates_page_as_discovered(): void
    {
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', 'https://example.com/interesting-post')
            ->call('submit')
            ->assertHasNoErrors();

        // Assert the status as well as the URL: the test name promises
        // "as discovered", so a row with any other status must fail here.
        $this->assertDatabaseHas('pages', [
            'url' => 'https://example.com/interesting-post',
            'status' => PageStatusEnum::Discovered,
        ]);
    }

    /**
     * Submitting a URL that already exists does not create a second row.
     */
    public function test_duplicate_url_submission_does_not_create_second_page(): void
    {
        $url = 'https://example.com/seen-before';
        Page::factory()->create([
            'url' => $url,
            'status' => PageStatusEnum::Discovered,
        ]);

        Livewire::test(UrlSubmissionForm::class)
            ->set('url', $url)
            ->call('submit')
            ->assertHasNoErrors();

        $this->assertDatabaseCount('pages', 1);
    }

    /**
     * After a successful submit the component exposes the URL as confirmedUrl,
     * clears the input, and shows the URL in its output.
     */
    public function test_confirmation_state_echoes_submitted_url(): void
    {
        $url = 'https://example.com/great-article';

        Livewire::test(UrlSubmissionForm::class)
            ->set('url', $url)
            ->call('submit')
            ->assertHasNoErrors()
            ->assertSet('confirmedUrl', $url)
            ->assertSet('url', '')
            ->assertSee($url);
    }

    /**
     * An empty URL fails the "required" rule (regression lock).
     */
    public function test_missing_url_fails_validation(): void
    {
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', '')
            ->call('submit')
            ->assertHasErrors(['url' => 'required']);
    }

    /**
     * Each malformed or disallowed URL from the provider fails validation.
     */
    #[DataProvider('invalidUrls')]
    public function test_invalid_url_formats_fail_validation(string $url): void
    {
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', $url)
            ->call('submit')
            ->assertHasErrors('url');
    }

    /**
     * Named data sets of URLs that must be rejected by the form.
     *
     * @return array<string, array{0: string}>
     */
    public static function invalidUrls(): array
    {
        return [
            'no scheme' => ['not-a-url'],
            'disallowed scheme' => ['ftp://example.com'],
            'javascript scheme' => ['javascript:alert(1)'],
        ];
    }

    /**
     * Integration: a form submission results in exactly one crawl row for the
     * submitted URL's domain.
     */
    public function test_url_submission_form_enqueues_crawl_via_observer(): void
    {
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', 'https://example.com/article')
            ->call('submit')
            ->assertHasNoErrors();

        $this->assertDatabaseCount('page_crawls', 1);
        $this->assertDatabaseHas('page_crawls', ['domain' => 'example.com']);
    }

    /**
     * Ten submissions succeed; the eleventh is rejected with a rate_limit
     * error, shows the rate-limit message, and is not persisted.
     */
    public function test_rate_limit_blocks_eleventh_submission_within_a_minute(): void
    {
        // 10 submissions within the limit — each must succeed
        for ($i = 1; $i <= 10; $i++) {
            Livewire::test(UrlSubmissionForm::class)
                ->set('url', "https://example.com/post-{$i}")
                ->call('submit')
                ->assertHasNoErrors();
        }

        // 11th submission from the same IP must be blocked, with the message visible
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', 'https://example.com/post-11')
            ->call('submit')
            ->assertHasErrors('rate_limit')
            ->assertSee('Too many submissions');

        // The 11th URL must NOT have been persisted
        $this->assertDatabaseCount('pages', 10);
    }
}

View file

@ -0,0 +1,83 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Actions;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\PageStatusEnum;
use App\Models\Page;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Tests for RegisterDiscoveredPageAction: creating new pages, attaching an
 * instance id, and returning existing rows without altering them.
 */
class RegisterDiscoveredPageActionTest extends TestCase
{
    use RefreshDatabase;

    /**
     * A new URL yields a persisted Page in Discovered status with no instance.
     */
    public function test_creates_page_with_url_and_discovered_status(): void
    {
        $url = 'https://example.com/article';

        $created = (new RegisterDiscoveredPageAction)($url);

        $this->assertInstanceOf(Page::class, $created);
        $this->assertSame($url, $created->url);
        $this->assertSame(PageStatusEnum::Discovered, $created->status);
        $this->assertNull($created->instance_id);
        $this->assertDatabaseHas('pages', ['url' => $url]);
    }

    /**
     * The optional instanceId argument is stored on the created page.
     */
    public function test_creates_page_with_provided_instance_id(): void
    {
        $source = Instance::factory()
            ->type(InstanceType::Mastodon)
            ->enabled()
            ->create();
        $register = new RegisterDiscoveredPageAction;

        $created = $register('https://example.com/fediverse-post', instanceId: $source->id);

        $this->assertInstanceOf(Page::class, $created);
        $this->assertSame($source->id, $created->instance_id);
        $this->assertDatabaseHas('pages', [
            'url' => 'https://example.com/fediverse-post',
            'instance_id' => $source->id,
        ]);
    }

    /**
     * Calling the action for an already stored URL returns that same row.
     */
    public function test_returns_existing_page_when_url_already_exists(): void
    {
        $url = 'https://example.com/seen-before';
        $stored = Page::factory()->createQuietly([
            'url' => $url,
            'status' => PageStatusEnum::Discovered,
        ]);

        $found = (new RegisterDiscoveredPageAction)($url);

        $this->assertSame($stored->id, $found->id);
        $this->assertDatabaseCount('pages', 1);
    }

    /**
     * A page that is already Fetched keeps that status on a duplicate call.
     */
    public function test_existing_page_status_not_overwritten_on_duplicate_call(): void
    {
        $fetchedRow = [
            'url' => 'https://example.com/already-fetched',
            'status' => PageStatusEnum::Fetched,
        ];
        Page::factory()->createQuietly($fetchedRow);

        $found = (new RegisterDiscoveredPageAction)('https://example.com/already-fetched');

        $this->assertSame(PageStatusEnum::Fetched, $found->status);
        $this->assertDatabaseHas('pages', $fetchedRow);
    }
}

View file

@ -0,0 +1,75 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Enums;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use Tests\TestCase;
/**
 * Pins the cases, backing values and helper methods of CrawlOutcomeEnum.
 */
class CrawlOutcomeEnumTest extends TestCase
{
    /**
     * Every expected backing value resolves to the case with the expected name.
     */
    public function test_all_expected_cases_exist_with_correct_backing_values(): void
    {
        // Keyed by backing value, mapping to the expected case name.
        $expected = [
            'success' => 'Success',
            'failed' => 'Failed',
            'timeout' => 'Timeout',
            'blocked_robots' => 'BlockedRobots',
            'blocked_4xx' => 'Blocked4xx',
            'blocked_5xx' => 'Blocked5xx',
            'rejected' => 'Rejected',
        ];

        foreach ($expected as $backingValue => $caseName) {
            $case = CrawlOutcomeEnum::from($backingValue);

            $this->assertSame($caseName, $case->name, "Case name for '{$backingValue}' should be '{$caseName}'");
            $this->assertSame($backingValue, $case->value, "Backing value for '{$caseName}' should be '{$backingValue}'");
        }
    }

    /**
     * Guards against cases being added or removed unnoticed.
     */
    public function test_enum_has_exactly_seven_cases(): void
    {
        $this->assertCount(7, CrawlOutcomeEnum::cases());
    }

    /**
     * toPageStatus(): Success → Fetched, Rejected → Rejected, all else → Failed.
     */
    public function test_to_page_status_maps_each_outcome_correctly(): void
    {
        $mapping = [
            [CrawlOutcomeEnum::Success, PageStatusEnum::Fetched],
            [CrawlOutcomeEnum::Rejected, PageStatusEnum::Rejected],
            [CrawlOutcomeEnum::Failed, PageStatusEnum::Failed],
            [CrawlOutcomeEnum::Timeout, PageStatusEnum::Failed],
            [CrawlOutcomeEnum::Blocked4xx, PageStatusEnum::Failed],
            [CrawlOutcomeEnum::Blocked5xx, PageStatusEnum::Failed],
            [CrawlOutcomeEnum::BlockedRobots, PageStatusEnum::Failed],
        ];

        foreach ($mapping as [$outcome, $status]) {
            $this->assertSame($status, $outcome->toPageStatus());
        }
    }

    /**
     * isRetryable() is true only for Failed, Timeout and Blocked5xx.
     */
    public function test_is_retryable_returns_true_only_for_transient_failures(): void
    {
        // Retryable: transient network/server problems that may resolve later
        $transient = [
            CrawlOutcomeEnum::Failed,
            CrawlOutcomeEnum::Timeout,
            CrawlOutcomeEnum::Blocked5xx,
        ];
        foreach ($transient as $outcome) {
            $this->assertTrue($outcome->isRetryable());
        }

        // Not retryable: success (done), permanent failures, or policy decisions
        $final = [
            CrawlOutcomeEnum::Success,
            CrawlOutcomeEnum::Rejected,
            CrawlOutcomeEnum::BlockedRobots,
            CrawlOutcomeEnum::Blocked4xx,
        ];
        foreach ($final as $outcome) {
            $this->assertFalse($outcome->isRetryable());
        }
    }

    /**
     * shouldRegisterOutboundLinks() is true for Success and false otherwise.
     */
    public function test_should_register_outbound_links_returns_true_only_for_success(): void
    {
        $this->assertTrue(CrawlOutcomeEnum::Success->shouldRegisterOutboundLinks());

        // No links to register on any non-Success outcome
        $nonSuccess = [
            CrawlOutcomeEnum::Failed,
            CrawlOutcomeEnum::Timeout,
            CrawlOutcomeEnum::Rejected,
            CrawlOutcomeEnum::BlockedRobots,
            CrawlOutcomeEnum::Blocked4xx,
            CrawlOutcomeEnum::Blocked5xx,
        ];
        foreach ($nonSuccess as $outcome) {
            $this->assertFalse($outcome->shouldRegisterOutboundLinks());
        }
    }
}

View file

@ -0,0 +1,33 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Enums;
use App\Enums\PageStatusEnum;
use Tests\TestCase;
/**
 * Pins the cases and backing values of PageStatusEnum.
 */
class PageStatusEnumTest extends TestCase
{
    /**
     * Every expected backing value resolves to the case with the expected name.
     */
    public function test_all_expected_cases_exist_with_correct_backing_values(): void
    {
        // Keyed by backing value, mapping to the expected case name.
        $expected = [
            'discovered' => 'Discovered',
            'fetched' => 'Fetched',
            'failed' => 'Failed',
            'rejected' => 'Rejected',
        ];

        foreach ($expected as $backingValue => $caseName) {
            $case = PageStatusEnum::from($backingValue);

            $this->assertSame($caseName, $case->name, "Case name for '{$backingValue}' should be '{$caseName}'");
            $this->assertSame($backingValue, $case->value, "Backing value for '{$caseName}' should be '{$backingValue}'");
        }
    }

    /**
     * Guards against cases being added or removed unnoticed.
     */
    public function test_enum_has_exactly_four_cases(): void
    {
        $this->assertCount(4, PageStatusEnum::cases());
    }
}

View file

@ -1,16 +0,0 @@
<?php
namespace Tests\Unit;
use PHPUnit\Framework\TestCase;
/**
 * Placeholder unit test class containing a single trivial assertion.
 */
class ExampleTest extends TestCase
{
/**
* Asserts that the literal true is true; exercises no application code.
*/
public function test_that_true_is_true(): void
{
$this->assertTrue(true);
}
}

View file

@ -0,0 +1,42 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Models;
use App\Enums\CrawlOutcomeEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Tests\TestCase;
/**
 * Checks the successful() and failed() states of the PageCrawl factory.
 */
class PageCrawlFactoryTest extends TestCase
{
    use RefreshDatabase;

    /**
     * successful() yields a Success outcome, a completion time, and no error.
     */
    public function test_factory_successful_state_produces_success_outcome(): void
    {
        Queue::fake();

        $owner = Page::factory()->create();
        $finished = PageCrawl::factory()
            ->page($owner)
            ->successful()
            ->create();

        $this->assertSame(CrawlOutcomeEnum::Success, $finished->outcome);
        $this->assertInstanceOf(Carbon::class, $finished->completed_at);
        $this->assertNull($finished->error_message);
    }

    /**
     * failed($message) yields a Failed outcome, a completion time, and the
     * given error message.
     */
    public function test_factory_failed_state_produces_failed_outcome_with_message(): void
    {
        Queue::fake();

        $owner = Page::factory()->create();
        $broken = PageCrawl::factory()
            ->page($owner)
            ->failed('Connection timed out')
            ->create();

        $this->assertSame(CrawlOutcomeEnum::Failed, $broken->outcome);
        $this->assertInstanceOf(Carbon::class, $broken->completed_at);
        $this->assertSame('Connection timed out', $broken->error_message);
    }
}

View file

@ -0,0 +1,111 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Models;
use App\Enums\CrawlOutcomeEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Tests\TestCase;
/**
 * Model-level tests for PageCrawl: attribute persistence and casts, the page
 * relation, cascade deletion, and filtering pending crawls by null outcome.
 */
class PageCrawlTest extends TestCase
{
    use RefreshDatabase;

    /**
     * Mass-assigned attributes survive a round trip through the database and
     * come back with the expected casts applied.
     */
    public function test_page_crawl_fillable_fields_persist_and_casts_are_applied(): void
    {
        Queue::fake();

        $owner = Page::factory()->createQuietly(['url' => 'https://example.com/page-1']);
        $completedAt = Carbon::parse('2026-05-01 10:01:05');

        $attributes = [
            'page_id' => $owner->id,
            'domain' => 'example.com',
            'priority' => 5,
            'completed_at' => $completedAt,
            'outcome' => CrawlOutcomeEnum::Success,
            'status_code' => 200,
            'error_message' => null,
        ];
        $reloaded = PageCrawl::create($attributes)->fresh();

        $this->assertNotNull($reloaded);

        // domain / priority round-trip
        $this->assertSame('example.com', $reloaded->domain);
        $this->assertSame(5, $reloaded->priority);

        // outcome is cast to the enum
        $this->assertInstanceOf(CrawlOutcomeEnum::class, $reloaded->outcome);
        $this->assertSame(CrawlOutcomeEnum::Success, $reloaded->outcome);

        // datetime casts
        $this->assertInstanceOf(Carbon::class, $reloaded->completed_at);
        $this->assertTrue($completedAt->equalTo($reloaded->completed_at));

        // nullable columns
        $this->assertNull($reloaded->error_message);

        // status_code persists
        $this->assertSame(200, $reloaded->status_code);
    }

    /**
     * The page relation resolves to the Page the crawl was created for.
     */
    public function test_page_crawl_belongs_to_a_page(): void
    {
        $owner = Page::factory()->createQuietly(['url' => 'https://example.com/page-2']);
        $crawl = PageCrawl::create([
            'page_id' => $owner->id,
            'domain' => 'example.com',
            'priority' => 1,
        ]);

        $resolved = $crawl->page;

        $this->assertInstanceOf(Page::class, $resolved);
        $this->assertSame($owner->id, $resolved->id);
    }

    /**
     * Deleting a page removes every crawl row that belongs to it.
     */
    public function test_deleting_a_page_cascades_to_its_page_crawls(): void
    {
        // createQuietly() skips the PageObserver so the count of explicit rows is predictable;
        // this test is about cascade delete behaviour, not observer side effects.
        $owner = Page::factory()->createQuietly(['url' => 'https://example.com/page-cascade']);

        $crawls = PageCrawl::factory()->page($owner);
        $crawls->create();
        $crawls->successful()->create();
        $crawls->failed('timeout during fetch')->create();

        $this->assertSame(3, PageCrawl::count());

        $owner->delete();

        $this->assertSame(0, PageCrawl::count());
    }

    /**
     * Only the crawl without an outcome is returned by a whereNull('outcome')
     * filter; the completed crawls are returned by whereNotNull('outcome').
     */
    public function test_pending_crawls_are_filtered_by_null_outcome(): void
    {
        Queue::fake();

        // createQuietly() skips the PageObserver; this test counts rows with null/non-null
        // outcome — the auto-inserted observer crawl (outcome=null) would corrupt both counts.
        $owner = Page::factory()->createQuietly(['url' => 'https://example.com/page-pending']);

        $pending = PageCrawl::factory()->page($owner)->create();
        PageCrawl::factory()->page($owner)->successful()->create();
        PageCrawl::factory()->page($owner)->failed('connection refused')->create();

        $this->assertSame(1, PageCrawl::whereNull('outcome')->count());

        $firstPending = PageCrawl::whereNull('outcome')->first();
        $this->assertSame($pending->id, $firstPending->id);

        $this->assertSame(2, PageCrawl::whereNotNull('outcome')->count());
    }
}

Some files were not shown because too many files have changed in this diff Show more