diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..88f6a9a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,49 @@ +# Version control +.git +.gitignore +.gitattributes + +# Dev environment +shell.nix +Dockerfile.dev +docker/ + +# Tests (not needed in prod image) +tests/ +phpunit.xml +.phpunit.result.cache +phpstan.neon + +# Dependencies (rebuilt during image build) +node_modules/ +vendor/ + +# Build artifacts (frontend stage produces these) +public/build/ +public/hot + +# Editor / OS +.editorconfig +.idea/ +.vscode/ +.DS_Store +*.swp +*.swo + +# Env / secrets +.env +.env.* +!.env.example + +# Logs and runtime caches +storage/logs/*.log +storage/framework/cache/data/ +storage/framework/sessions/ +storage/framework/views/ + +# CI +.forgejo/ + +# Docs / project meta +README.md +LICENSE diff --git a/.env.example b/.env.example index 79935dc..ac89b76 100644 --- a/.env.example +++ b/.env.example @@ -61,3 +61,9 @@ AWS_BUCKET= AWS_USE_PATH_STYLE_ENDPOINT=false VITE_APP_NAME="${APP_NAME}" + +CRAWLER_MIN_DOMAIN_DELAY_SECONDS=10 + +NTFY_URL= +NTFY_TOPIC= +NTFY_THRESHOLD= diff --git a/.forgejo/workflows/build.yml b/.forgejo/workflows/build.yml index 53d63b2..3676bb3 100644 --- a/.forgejo/workflows/build.yml +++ b/.forgejo/workflows/build.yml @@ -5,8 +5,7 @@ on: branches: [main] tags: ['v*'] paths: - - 'Dockerfile' - - 'docker/**' + - 'docker/prod/Dockerfile' - 'app/**' - 'bootstrap/**' - 'config/**' @@ -51,6 +50,6 @@ jobs: uses: https://data.forgejo.org/docker/build-push-action@v5 with: context: . - file: Dockerfile + file: docker/prod/Dockerfile push: true tags: ${{ steps.meta.outputs.tags }} diff --git a/README.md b/README.md index 4f73b2a..4fbbe84 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,128 @@ -# trove +# Trove -A small web search engine. +A federated search engine for the small web. Seeded by fediverse attention, ranked by domain coherence rather than commercial authority. 
+ +## Tech stack + +Laravel 13 · Livewire 4 · PostgreSQL 17 (tsvector FTS) · Redis 7 · FrankenPHP · Vite 8 · Tailwind 4. + +## Local development + +Requires [Nix](https://nixos.org/download/) and [Podman](https://podman.io/). + +```sh +nix-shell # enter dev shell +dev-up # start app, db, redis +``` + +App: `http://localhost:8200` · Vite HMR: `http://localhost:5175` + +Other helpers inside the nix shell: `dev-down`, `dev-rebuild`, `dev-shell`, `dev-artisan COMMAND` (runs the given artisan command), `dev-logs`. + +## Self-hosting + +Trove ships as a Docker image published to `forge.lvl0.xyz/lvl0/trove`. You provide the compose/stack config. + +### Required environment + +| Variable | Purpose | +|---|---| +| `APP_KEY` | Laravel app key. Generate with `docker run --rm forge.lvl0.xyz/lvl0/trove:latest php artisan key:generate --show`. **Must persist across deployments** or sessions/encrypted data break. | +| `APP_URL` | Public URL, e.g. `https://trove.example.org` | +| `DB_DATABASE`, `DB_USERNAME`, `DB_PASSWORD` | PostgreSQL credentials | +| `DB_HOST` | Hostname of the PostgreSQL service. Default `db`. Override if your service is named differently. | +| `REDIS_HOST` | Hostname of the Redis service. Default `redis`. Override if your service is named differently. | + +### Services you need to provide + +- **App**: pull `forge.lvl0.xyz/lvl0/trove:latest` (or a pinned `v*` tag). Exposes port `8000` inside the container. The image runs migrations and warms caches on boot. +- **Worker**: same image as `app`, with `command: php artisan queue:work --tries=3 --max-time=3600`. Processes the crawler queue (URL fetching, content extraction, retries). Crawls won't actually run without this — `app` only enqueues work. **Required for the crawler to function.** +- **PostgreSQL 17**. Hostname must be reachable as `db` (default) or set `DB_HOST`. Persist `/var/lib/postgresql/data`. +- **Redis 7** with `--appendonly yes` (queue jobs persist across restarts). Hostname `redis` or set `REDIS_HOST`. 
+ +On first boot the startup script waits for PostgreSQL, warms caches, then runs `php artisan migrate --force` automatically. The 60-second wait loop covers slow PG init; it exits with a clear error if PG never becomes reachable. + +### Volumes + +- `/app/storage` — Laravel writable paths (logs, cached views, uploads). Persist this. + +### Healthcheck + +The image exposes `GET /up` (Laravel's built-in health route). The Dockerfile declares a HEALTHCHECK; your orchestrator can use `curl -fsS http://localhost:8000/up` for liveness. + +### Example compose stack + +A minimal reference — adapt for your infra. DockGE, Portainer, `docker compose`, Kubernetes, and bare `podman play kube` all work with equivalent configs. + +```yaml +services: + app: + image: forge.lvl0.xyz/lvl0/trove:latest + restart: always + ports: ["${APP_PORT:-8400}:8000"] + environment: + APP_KEY: "${APP_KEY}" + APP_URL: "${APP_URL}" + DB_DATABASE: "${DB_DATABASE}" + DB_USERNAME: "${DB_USERNAME}" + DB_PASSWORD: "${DB_PASSWORD}" + volumes: + - app_storage:/app/storage + depends_on: + db: { condition: service_healthy } + redis: { condition: service_healthy } + + worker: + image: forge.lvl0.xyz/lvl0/trove:latest + restart: always + command: php artisan queue:work --tries=3 --max-time=3600 + environment: + APP_KEY: "${APP_KEY}" + APP_URL: "${APP_URL}" + DB_DATABASE: "${DB_DATABASE}" + DB_USERNAME: "${DB_USERNAME}" + DB_PASSWORD: "${DB_PASSWORD}" + volumes: + - app_storage:/app/storage + depends_on: + db: { condition: service_healthy } + redis: { condition: service_healthy } + + db: + image: postgres:17-alpine + restart: always + environment: + POSTGRES_DB: "${DB_DATABASE}" + POSTGRES_USER: "${DB_USERNAME}" + POSTGRES_PASSWORD: "${DB_PASSWORD}" + volumes: + - db_data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"] + interval: 10s + retries: 5 + start_period: 10s + + redis: + image: redis:7-alpine + restart: always + command: redis-server 
--appendonly yes + volumes: + - redis_data:/data + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + retries: 5 + +volumes: + db_data: + redis_data: + app_storage: +``` + +### Upgrades + +Pull the new image tag, then recreate both the app and worker containers (they run the same image). Migrations run on boot (`php artisan migrate --force` in the startup script). Roll back by pointing at the previous `v*` tag. ---- diff --git a/app/Actions/FetchPageAction.php b/app/Actions/FetchPageAction.php new file mode 100644 index 0000000..e906b15 --- /dev/null +++ b/app/Actions/FetchPageAction.php @@ -0,0 +1,194 @@ +http + ->timeout(config('crawler.timeout')) + ->withHeaders([ + 'User-Agent' => config('crawler.user_agent'), + 'Accept' => 'text/html', + ]) + ->withOptions([ + 'allow_redirects' => ['max' => config('crawler.max_redirects')], + ]) + ->get($url); + + } catch (ConnectionException|ConnectException $e) { + return $this->failureResult($e); + } + + [$outcome, $error] = $this->validateResponse($response); + + if ($outcome === CrawlOutcomeEnum::Success) { + [$title, $extractedText, $links, $crawler] = $this->extractTitleTextAndLinks($response->body(), $url); + $wordCount = $extractedText !== '' ? count(preg_split('/\s+/u', trim($extractedText))) : 0; + [$language, $languageConfidence] = $this->detectLanguage($crawler, $extractedText, $wordCount); + } + + return new FetchResult( + outcome: $outcome, + statusCode: $response->status(), + finalUrl: $url, + title: $title ?? null, + extractedText: $extractedText ?? null, + outboundLinks: $links ?? collect(), + wordCount: $wordCount ?? null, + errorMessage: $error ?? null, + language: $language ?? null, + languageConfidence: $languageConfidence ?? 
null, + ); + } + + private function validateResponse(Response $response): array + { + $status = $response->status(); + + if ($status >= 400 && $status < 500) { + return [CrawlOutcomeEnum::Blocked4xx, "HTTP {$status}"]; + } + + if ($status >= 500) { + return [CrawlOutcomeEnum::Blocked5xx, "HTTP {$status}"]; + } + + $contentType = $response->header('Content-Type'); + if (! str_starts_with(mb_strtolower($contentType), 'text/html')) { + return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"]; + } + + return [CrawlOutcomeEnum::Success, null]; + } + + private function failureResult(ConnectionException|ConnectException $e): FetchResult + { + $guzzleException = $e instanceof ConnectException + ? $e + : ($e->getPrevious() instanceof ConnectException + ? $e->getPrevious() + : null); + + $errno = $guzzleException?->getHandlerContext()['errno'] ?? null; + + $outcome = $errno === CURLE_OPERATION_TIMEDOUT + ? CrawlOutcomeEnum::Timeout + : CrawlOutcomeEnum::Failed; + + return new FetchResult( + outcome: $outcome, + statusCode: null, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: $e->getMessage(), + ); + } + + /** + * Parse fetched HTML into its title, main-content text and outbound links. + * + * The title is read from the full document; text and links come only from the + * main-content fragment that Readability extracts. Links are resolved against $url, + * validated, de-duplicated and re-indexed. + * + * @param string $body Raw HTML of the response. + * @param string $url URL the body was fetched from; base for resolving relative hrefs. + * @return array Positional tuple: [0] ?string title, [1] string extracted text, + * [2] collection of resolved link URLs, [3] Crawler over the full document. + */ + private function extractTitleTextAndLinks(string $body, string $url): array + { + $crawler = new Crawler($body); + + $title = $crawler->filter('title')->count() > 0 + ? trim($crawler->filter('title')->text()) + : null; + + $readability = new Readability(new Configuration); + + // Readability::parse() throws ParseException when it cannot isolate a main-content node. + // Treat that as "no main content" (empty text, no links) rather than letting the + // exception escape __invoke(), which does not catch it. + try { + $readability->parse($body); + $mainContent = $readability->getContent() ?? ''; + } catch (\fivefilters\Readability\ParseException) { + $mainContent = 
''; + } + + $extractedText = trim(strip_tags($mainContent)); + + $links = collect(); + if ($mainContent !== '') { + $linkCrawler = new Crawler($mainContent); + if ($linkCrawler->filter('a[href]')->count() > 0) { + $links = collect($linkCrawler->filter('a[href]')->extract(['href'])); + } + } + + $linksResolved = $links + ->map(fn (string $href) => $this->resolveAndValidateLink($href, $url)) + ->filter() + ->unique() + ->values(); + + return [$title, $extractedText, $linksResolved, $crawler]; + } + + private function resolveAndValidateLink(string $href, string $finalUrl): ?string + { + try { + $resolved = (string) BaseUri::from($finalUrl)->resolve($href); + $resolved = strstr($resolved, '#', true) ?: $resolved; + } catch (Throwable) { + return null; + } + + if ($resolved === $finalUrl) { + return null; + } + + try { + $this->urlService->host($resolved); + } catch (InvalidArgumentException) { + return null; + } + + return $resolved; + } + + /** + * @return array{0: ?string, 1: ?float} + */ + private function detectLanguage(Crawler $crawler, string $extractedText, ?int $wordCount = null): array + { + if ($wordCount >= self::MIN_WORDS_FOR_TEXT_DETECTION) { + $result = $this->languageDetection->detect($extractedText); + if ($result !== null && $result[1] >= self::MIN_TEXT_DETECTION_CONFIDENCE) { + return [$result[0], $result[1]]; + } + } + + $lang = $crawler->filter('html')->count() > 0 + ? trim($crawler->filter('html')->attr('lang') ?? 
'') + : ''; + + if ($lang !== '' && strlen($lang) <= 35) { + return [$lang, 1.0]; + } + + return [null, null]; + } +} diff --git a/app/Actions/RegisterDiscoveredPageAction.php b/app/Actions/RegisterDiscoveredPageAction.php new file mode 100644 index 0000000..840e52c --- /dev/null +++ b/app/Actions/RegisterDiscoveredPageAction.php @@ -0,0 +1,22 @@ + $url], + [ + 'status' => PageStatusEnum::Discovered, + 'instance_id' => $instanceId, + ], + ); + } +} diff --git a/app/Enums/CrawlOutcomeEnum.php b/app/Enums/CrawlOutcomeEnum.php new file mode 100644 index 0000000..582fdc9 --- /dev/null +++ b/app/Enums/CrawlOutcomeEnum.php @@ -0,0 +1,60 @@ + PageStatusEnum::Fetched, + self::Rejected => PageStatusEnum::Rejected, + self::Failed, + self::Timeout, + self::BlockedRobots, + self::Blocked4xx, + self::Blocked5xx => PageStatusEnum::Failed, + }; + } + + /** + * True if the worker should retry this outcome (transient failures only). + * Permanent failures (4xx, robots block, rejected content type) and successes do not retry. + */ + public function isRetryable(): bool + { + return match ($this) { + self::Failed, self::Timeout, self::Blocked5xx => true, + self::Success, self::Rejected, self::BlockedRobots, self::Blocked4xx => false, + }; + } + + /** + * True if the worker should register the outbound links discovered during the fetch. + * Only Success outcomes have meaningful links; everything else either failed or returned no usable HTML. 
+ */ + public function shouldRegisterOutboundLinks(): bool + { + return $this === self::Success; + } +} diff --git a/app/Enums/PageStatusEnum.php b/app/Enums/PageStatusEnum.php new file mode 100644 index 0000000..84bee4c --- /dev/null +++ b/app/Enums/PageStatusEnum.php @@ -0,0 +1,20 @@ + fn ($q) => $q->where('status', PageStatusEnum::Failed), + ])->orderBy('url', 'asc')->get(); + + return view('admin.index', ['instances' => $instances]); + } +} diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php new file mode 100644 index 0000000..2c15b0c --- /dev/null +++ b/app/Jobs/ProcessCrawlJob.php @@ -0,0 +1,127 @@ +isAllowed($this->pageCrawl->page->url)) { + $this->pageCrawl->update([ + 'outcome' => CrawlOutcomeEnum::BlockedRobots, + 'completed_at' => now(), + ]); + $this->pageCrawl->page->update(['status' => PageStatusEnum::Failed]); + + return; + } + + $fetcher = resolve(FetchPageAction::class); + $register = resolve(RegisterDiscoveredPageAction::class); + $politenessService = resolve(PolitenessService::class); + + $delay = $politenessService->minDelayFor($this->pageCrawl->domain); + $lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay); + + if (! 
$lock->get()) { + $this->release($delay); + + return; + } + + $result = $fetcher($this->pageCrawl->page->url); + + $this->writeOutcome($result); + $this->updatePageStatus($result); + + if ($result->outcome->shouldRegisterOutboundLinks()) { + $result->outboundLinks->each(fn (string $url) => $register($url)); + } + + if ($result->outcome->isRetryable()) { + $this->scheduleRetryIfNeeded(); + } + } + + private function writeOutcome(FetchResult $result): void + { + $this->pageCrawl->update([ + 'outcome' => $result->outcome, + 'completed_at' => now(), + 'status_code' => $result->statusCode, + 'error_message' => $result->errorMessage, + ]); + } + + private function updatePageStatus(FetchResult $result): void + { + $status = $result->outcome->toPageStatus(); + + $update = match ($status) { + PageStatusEnum::Fetched => [ + 'status' => $status, + 'fetched_at' => now(), + 'title' => $result->title, + // Sticky language: only write when detection produced a value, so a re-crawl + // returning null doesn't erase a previously-detected language. Guarding on + // language alone is sufficient because FetchPageAction::detectLanguage() + // always returns the pair as both-null or both-non-null (never mixed). + ...($result->language !== null ? 
[ + 'language' => $result->language, + 'language_confidence' => $result->languageConfidence, + ] : []), + ], + PageStatusEnum::Failed => [ + 'status' => $status, + 'failed_at' => now(), + ], + PageStatusEnum::Rejected => [ + 'status' => $status, + ], + PageStatusEnum::Discovered => [ + 'status' => $status, + ], + }; + + $this->pageCrawl->page->update($update); + } + + /** + * Queue one more crawl attempt for this page, delayed by an hour. + * + * Does nothing once the page already has three PageCrawl rows. The new row is created + * with model events suppressed and the job is dispatched here explicitly with the delay. + */ + private function scheduleRetryIfNeeded(): void + { + if (PageCrawl::where('page_id', $this->pageCrawl->page_id)->count() >= 3) { + return; + } + + // Build the retry row from the identifying columns only. Cloning toArray() would also + // copy the finished attempt's completed_at, status_code and error_message (all fillable), + // so the pending retry would be stored as already completed. + $newRow = PageCrawl::withoutEvents( + fn () => PageCrawl::create([ + 'page_id' => $this->pageCrawl->page_id, + 'domain' => $this->pageCrawl->domain, + 'priority' => $this->pageCrawl->priority, + 'outcome' => null, + ]) + ); + + ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour()); + } +} diff --git a/app/Listeners/PollFailedListener.php b/app/Listeners/PollFailedListener.php new file mode 100644 index 0000000..e501ff7 --- /dev/null +++ b/app/Listeners/PollFailedListener.php @@ -0,0 +1,18 @@ +service->recordFailure($event->instance, $event->message); + } +} diff --git a/app/Listeners/UrlDiscoveredListener.php b/app/Listeners/UrlDiscoveredListener.php new file mode 100644 index 0000000..67b4f1f --- /dev/null +++ b/app/Listeners/UrlDiscoveredListener.php @@ -0,0 +1,36 @@ +registerPage)($event->url, $event->instanceId); + + if ($event->postUrl === null || $event->postUrl === $event->url) { + return; + } + + $sourcePage = ($this->registerPage)($event->postUrl, $event->instanceId); + + PageLink::firstOrCreate([ + 'source_page_id' => $sourcePage->id, + 'target_page_id' => $targetPage->id, + ]); + }); + } +} diff --git a/app/Livewire/UrlSubmissionForm.php b/app/Livewire/UrlSubmissionForm.php new file mode 100644 index 0000000..8c1b11e --- /dev/null +++ b/app/Livewire/UrlSubmissionForm.php @@ -0,0 +1,44 @@ +ip(); + + if (RateLimiter::tooManyAttempts($key, 10)) { + $this->addError('rate_limit', 'Too many submissions, try again shortly.'); + + return; + } + + RateLimiter::hit($key, 60); + + $validated = $this->validate([ + 'url' => 
['required', 'url:http,https'], + ]); + + $registerPage($validated['url']); + + $this->confirmedUrl = $validated['url']; + $this->reset('url'); + } + + public function render(): View + { + return view('livewire.url-submission-form'); + } +} diff --git a/app/Models/Page.php b/app/Models/Page.php new file mode 100644 index 0000000..52a131c --- /dev/null +++ b/app/Models/Page.php @@ -0,0 +1,68 @@ + */ + use HasFactory; + + protected $fillable = [ + 'url', + 'status', + 'language', + 'language_confidence', + 'title', + 'instance_id', + 'posted_at', + 'fetched_at', + 'failed_at', + ]; + + protected $casts = [ + 'status' => PageStatusEnum::class, + 'language_confidence' => 'float', + 'posted_at' => 'datetime', + 'fetched_at' => 'datetime', + 'failed_at' => 'datetime', + ]; + + public function instance(): BelongsTo + { + return $this->belongsTo(Instance::class); + } + + public function outgoingLinks(): HasMany + { + return $this->hasMany(PageLink::class, 'source_page_id'); + } + + public function incomingLinks(): HasMany + { + return $this->hasMany(PageLink::class, 'target_page_id'); + } + + public function crawls(): HasMany + { + return $this->hasMany(PageCrawl::class); + } + + public function latestCrawl(): HasOne + { + return $this->hasOne(PageCrawl::class)->latestOfMany('created_at'); + } +} diff --git a/app/Models/PageCrawl.php b/app/Models/PageCrawl.php new file mode 100644 index 0000000..ba2ba29 --- /dev/null +++ b/app/Models/PageCrawl.php @@ -0,0 +1,45 @@ + */ + use HasFactory; + + protected $fillable = [ + 'page_id', + 'domain', + 'priority', + 'completed_at', + 'outcome', + 'status_code', + 'error_message', + ]; + + protected $casts = [ + 'priority' => 'integer', + 'completed_at' => 'datetime', + 'outcome' => CrawlOutcomeEnum::class, + 'status_code' => 'integer', + ]; + + /** + * @return BelongsTo + */ + public function page(): BelongsTo + { + return $this->belongsTo(Page::class); + } +} diff --git a/app/Models/PageLink.php b/app/Models/PageLink.php new file 
mode 100644 index 0000000..a8e67f8 --- /dev/null +++ b/app/Models/PageLink.php @@ -0,0 +1,31 @@ + */ + use HasFactory; + + protected $fillable = [ + 'source_page_id', + 'target_page_id', + ]; + + public function sourcePage(): BelongsTo + { + return $this->belongsTo(Page::class, 'source_page_id'); + } + + public function targetPage(): BelongsTo + { + return $this->belongsTo(Page::class, 'target_page_id'); + } +} diff --git a/app/Observers/PageCrawlObserver.php b/app/Observers/PageCrawlObserver.php new file mode 100644 index 0000000..85a8517 --- /dev/null +++ b/app/Observers/PageCrawlObserver.php @@ -0,0 +1,14 @@ + $page->id], + [ + 'domain' => $this->urlService->host($page->url), + 'priority' => 0, + ], + ); + } +} diff --git a/app/Providers/AppServiceProvider.php b/app/Providers/AppServiceProvider.php index 452e6b6..dfb03cd 100644 --- a/app/Providers/AppServiceProvider.php +++ b/app/Providers/AppServiceProvider.php @@ -2,23 +2,24 @@ namespace App\Providers; +use App\Listeners\PollFailedListener; +use App\Listeners\UrlDiscoveredListener; +use App\Services\LanguageDetectionService; +use Illuminate\Support\Facades\Event; use Illuminate\Support\ServiceProvider; +use Lvl0\FediDiscover\Events\PollFailed; +use Lvl0\FediDiscover\Events\UrlDiscovered; class AppServiceProvider extends ServiceProvider { - /** - * Register any application services. - */ public function register(): void { - // + $this->app->singleton(LanguageDetectionService::class); } - /** - * Bootstrap any application services. 
- */ public function boot(): void { - // + Event::listen(UrlDiscovered::class, UrlDiscoveredListener::class); + Event::listen(PollFailed::class, PollFailedListener::class); } } diff --git a/app/Services/LanguageDetectionService.php b/app/Services/LanguageDetectionService.php new file mode 100644 index 0000000..2724ea0 --- /dev/null +++ b/app/Services/LanguageDetectionService.php @@ -0,0 +1,39 @@ +language = new Language; + } + + /** + * @return array{0: string, 1: float}|null + */ + public function detect(string $text): ?array + { + if (trim($text) === '') { + return null; + } + + $languages = $this->language->detect($text)->bestResults()->close(); + + if ($languages === []) { + return null; + } + + // bestResults() keeps every candidate within 0.025 of the top score. + // array_key_first picks the highest-ranked one (arsort'd by the library). + $code = array_key_first($languages); + + return [$code, $languages[$code]]; + } +} diff --git a/app/Services/PolitenessService.php b/app/Services/PolitenessService.php new file mode 100644 index 0000000..4d2b12b --- /dev/null +++ b/app/Services/PolitenessService.php @@ -0,0 +1,19 @@ +crawlDelayFor($domain, config('crawler.user_agent')); + + $configValue = config('crawler.min_domain_delay_seconds', 10); + + return max($crawlDelay ?? 0, $configValue); + } +} diff --git a/app/Services/PollAlertService.php b/app/Services/PollAlertService.php new file mode 100644 index 0000000..44185d1 --- /dev/null +++ b/app/Services/PollAlertService.php @@ -0,0 +1,38 @@ +increment('consecutive_poll_failures'); + $instance->refresh(); + + $ntfyUrl = config('services.ntfy.url'); + $ntfyThreshold = config('services.ntfy.threshold'); + $ntfyTopic = config('services.ntfy.topic'); + + if ($ntfyUrl === null || $ntfyThreshold === null || $ntfyThreshold === 0 || $ntfyTopic === null) { + return; + } + + if ($instance->consecutive_poll_failures < $ntfyThreshold) { + return; + } + + try { + Http::timeout(5) + ->withBody($instance->url . ' - ' . 
$message, 'text/plain') + ->post(rtrim($ntfyUrl, '/') . '/' . $ntfyTopic); + } catch (Exception $e) { + logger()->warning('ntfy alert failed', ['instance' => $instance->url, 'error' => $e->getMessage()]); + } + } +} diff --git a/app/Services/RobotsService.php b/app/Services/RobotsService.php new file mode 100644 index 0000000..f8b7f65 --- /dev/null +++ b/app/Services/RobotsService.php @@ -0,0 +1,60 @@ +urlService->host($url); + $path = parse_url($url, PHP_URL_PATH) ?? '/'; + + $body = Cache::remember( + "crawler:robots:{$host}", + config('crawler.robots_cache_ttl_seconds'), + function () use ($host) { + try { + $response = Http::get("https://{$host}/robots.txt"); + + return $response->successful() ? $response->body() : ''; + } catch (ConnectionException) { + return ''; + } + } + ); + + return (new RobotsTxt($body))->allows($path, $userAgent); + } + + public function crawlDelayFor(string $host, string $userAgent): ?int + { + $body = Cache::remember( + "crawler:robots:{$host}", + config('crawler.robots_cache_ttl_seconds'), + function () use ($host) { + try { + $response = Http::get("https://{$host}/robots.txt"); + + return $response->successful() ? $response->body() : ''; + } catch (ConnectionException) { + return ''; + } + } + ); + + $delay = (new RobotsTxt($body))->crawlDelay($userAgent); + + return $delay !== null ? (int) $delay : null; + } +} diff --git a/app/Services/UrlService.php b/app/Services/UrlService.php new file mode 100644 index 0000000..6b1700c --- /dev/null +++ b/app/Services/UrlService.php @@ -0,0 +1,40 @@ +scheme(); + if ($scheme === null || $scheme === '') { + throw new InvalidArgumentException("URL has no scheme: {$url}"); + } + if (! 
in_array($scheme, ['http', 'https'], true)) { + throw new InvalidArgumentException("Invalid URL scheme: {$scheme}"); + } + + if ($uri->user() !== null) { + throw new InvalidArgumentException("URLs with embedded credentials not allowed: {$url}"); + } + + $host = $uri->host(); + if ($host === null || $host === '') { + throw new InvalidArgumentException("URL has no host: {$url}"); + } + + $bareHost = preg_replace('/%.*$/', '', trim($host, '[]')); + if (filter_var($bareHost, FILTER_VALIDATE_IP) !== false) { + throw new InvalidArgumentException("IP literal hosts not allowed: {$host}"); + } + + return mb_strtolower($host); + } +} diff --git a/app/ValueObjects/FetchResult.php b/app/ValueObjects/FetchResult.php new file mode 100644 index 0000000..3514b37 --- /dev/null +++ b/app/ValueObjects/FetchResult.php @@ -0,0 +1,28 @@ + $outboundLinks + */ + public function __construct( + public CrawlOutcomeEnum $outcome, + public ?int $statusCode, + public ?string $finalUrl, + public ?string $title, + public ?string $extractedText, + public Collection $outboundLinks, + public ?int $wordCount, + public ?string $errorMessage, + public ?string $language = null, + public ?float $languageConfidence = null, + ) {} +} diff --git a/bootstrap/app.php b/bootstrap/app.php index c183276..27faf69 100644 --- a/bootstrap/app.php +++ b/bootstrap/app.php @@ -3,15 +3,20 @@ use Illuminate\Foundation\Application; use Illuminate\Foundation\Configuration\Exceptions; use Illuminate\Foundation\Configuration\Middleware; +use Illuminate\Http\Request; return Application::configure(basePath: dirname(__DIR__)) ->withRouting( - web: __DIR__.'/../routes/web.php', - commands: __DIR__.'/../routes/console.php', + web: __DIR__ . '/../routes/web.php', + commands: __DIR__ . 
'/../routes/console.php', health: '/up', ) ->withMiddleware(function (Middleware $middleware): void { - // + $middleware->trustProxies( + at: '*', + headers: Request::HEADER_X_FORWARDED_FOR + | Request::HEADER_X_FORWARDED_PROTO, + ); }) ->withExceptions(function (Exceptions $exceptions): void { // diff --git a/composer.json b/composer.json index dcb3aca..9af1143 100644 --- a/composer.json +++ b/composer.json @@ -16,10 +16,14 @@ ], "require": { "php": "^8.3", + "fivefilters/readability.php": "^3.3", "laravel/framework": "^13.0", "laravel/tinker": "^3.0", "livewire/livewire": "^4.2", - "lvl0/fedi-discover": "@dev" + "lvl0/fedi-discover": "@dev", + "patrickschur/language-detection": "^5.3", + "spatie/robots-txt": "^2.5", + "symfony/dom-crawler": "^7.4" }, "require-dev": { "fakerphp/faker": "^1.23", diff --git a/composer.lock b/composer.lock index 15b7993..51ecdd2 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "e46e58784ec34415557c78db6bb6c97e", + "content-hash": "4d6e239c94fea8e9511f1e73f05db1df", "packages": [ { "name": "brick/math", @@ -508,6 +508,71 @@ ], "time": "2025-03-06T22:45:56+00:00" }, + { + "name": "fivefilters/readability.php", + "version": "v3.3.3", + "source": { + "type": "git", + "url": "https://github.com/fivefilters/readability.php.git", + "reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/fivefilters/readability.php/zipball/e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8", + "reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-mbstring": "*", + "ext-xml": "*", + "league/uri": "^7.0", + "masterminds/html5": "^2.0", + "php": ">=8.1", + "psr/log": "^1.0 || ^2.0 || ^3.0" + }, + "require-dev": { + "monolog/monolog": "^3.0", + "phpunit/phpunit": "^10.0 
|| ^11.0" + }, + "suggest": { + "monolog/monolog": "Allow logging debug information" + }, + "type": "library", + "autoload": { + "psr-4": { + "fivefilters\\Readability\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "Apache-2.0" + ], + "authors": [ + { + "name": "Andres Rey", + "email": "andreskrey@gmail.com", + "role": "Original Developer" + }, + { + "name": "Keyvan Minoukadeh", + "email": "keyvan@fivefilters.org", + "homepage": "https://www.fivefilters.org", + "role": "Developer/Maintainer" + } + ], + "description": "A PHP port of Readability.js", + "homepage": "https://github.com/fivefilters/readability.php", + "keywords": [ + "html", + "readability" + ], + "support": { + "issues": "https://github.com/fivefilters/readability.php/issues", + "source": "https://github.com/fivefilters/readability.php/tree/v3.3.3" + }, + "time": "2025-04-26T23:45:37+00:00" + }, { "name": "fruitcake/php-cors", "version": "v1.4.0", @@ -2102,7 +2167,7 @@ }, { "name": "lvl0/fedi-discover", - "version": "dev-main", + "version": "dev-release/0.1.0", "dist": { "type": "path", "url": "packages/Lvl0/FediDiscover", @@ -2142,6 +2207,73 @@ "relative": true } }, + { + "name": "masterminds/html5", + "version": "2.10.0", + "source": { + "type": "git", + "url": "https://github.com/Masterminds/html5-php.git", + "reference": "fcf91eb64359852f00d921887b219479b4f21251" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Masterminds/html5-php/zipball/fcf91eb64359852f00d921887b219479b4f21251", + "reference": "fcf91eb64359852f00d921887b219479b4f21251", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "php": ">=5.3.0" + }, + "require-dev": { + "phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7 || ^8 || ^9" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.7-dev" + } + }, + "autoload": { + "psr-4": { + "Masterminds\\": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + 
"MIT" + ], + "authors": [ + { + "name": "Matt Butcher", + "email": "technosophos@gmail.com" + }, + { + "name": "Matt Farina", + "email": "matt@mattfarina.com" + }, + { + "name": "Asmir Mustafic", + "email": "goetas@gmail.com" + } + ], + "description": "An HTML5 parser and serializer.", + "homepage": "http://masterminds.github.io/html5-php", + "keywords": [ + "HTML5", + "dom", + "html", + "parser", + "querypath", + "serializer", + "xml" + ], + "support": { + "issues": "https://github.com/Masterminds/html5-php/issues", + "source": "https://github.com/Masterminds/html5-php/tree/2.10.0" + }, + "time": "2025-07-25T09:04:22+00:00" + }, { "name": "monolog/monolog", "version": "3.10.0", @@ -2653,6 +2785,57 @@ ], "time": "2026-02-16T23:10:27+00:00" }, + { + "name": "patrickschur/language-detection", + "version": "v5.3.1", + "source": { + "type": "git", + "url": "https://github.com/patrickschur/language-detection.git", + "reference": "df8d32021b2ef9fde52e6fcccb83e3806822c9c6" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/patrickschur/language-detection/zipball/df8d32021b2ef9fde52e6fcccb83e3806822c9c6", + "reference": "df8d32021b2ef9fde52e6fcccb83e3806822c9c6", + "shasum": "" + }, + "require": { + "ext-json": "*", + "ext-mbstring": "*", + "php": "^7.4 || ^8.0" + }, + "require-dev": { + "phpunit/phpunit": "^9.5.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "LanguageDetection\\": "src/LanguageDetection" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Patrick Schur", + "email": "patrick_schur@outlook.de" + } + ], + "description": "A language detection library for PHP. 
Detects the language from a given text string.", + "homepage": "https://github.com/patrickschur/language-detection", + "keywords": [ + "detect", + "detection", + "language" + ], + "support": { + "issues": "https://github.com/patrickschur/language-detection/issues", + "source": "https://github.com/patrickschur/language-detection/tree/v5.3.1" + }, + "time": "2025-03-25T22:47:08+00:00" + }, { "name": "phpoption/phpoption", "version": "1.9.5", @@ -3417,6 +3600,66 @@ }, "time": "2025-12-14T04:43:48+00:00" }, + { + "name": "spatie/robots-txt", + "version": "2.5.4", + "source": { + "type": "git", + "url": "https://github.com/spatie/robots-txt.git", + "reference": "a8dd35d0a94e863f52509a366a634978e9c1db03" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/spatie/robots-txt/zipball/a8dd35d0a94e863f52509a366a634978e9c1db03", + "reference": "a8dd35d0a94e863f52509a366a634978e9c1db03", + "shasum": "" + }, + "require": { + "php": "^8.1" + }, + "require-dev": { + "phpunit/phpunit": "^11.5.2" + }, + "type": "library", + "autoload": { + "psr-4": { + "Spatie\\Robots\\": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Brent Roose", + "email": "brent@spatie.be", + "homepage": "https://spatie.be", + "role": "Developer" + } + ], + "description": "Determine if a page may be crawled from robots.txt and robots meta tags", + "homepage": "https://github.com/spatie/robots-txt", + "keywords": [ + "robots-txt", + "spatie" + ], + "support": { + "issues": "https://github.com/spatie/robots-txt/issues", + "source": "https://github.com/spatie/robots-txt/tree/2.5.4" + }, + "funding": [ + { + "url": "https://spatie.be/open-source/support-us", + "type": "custom" + }, + { + "url": "https://github.com/spatie", + "type": "github" + } + ], + "time": "2026-02-25T07:59:20+00:00" + }, { "name": "symfony/clock", "version": "v7.4.8", @@ -3729,6 +3972,78 @@ ], "time": "2024-09-25T14:21:43+00:00" }, + { + 
"name": "symfony/dom-crawler", + "version": "v7.4.8", + "source": { + "type": "git", + "url": "https://github.com/symfony/dom-crawler.git", + "reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/dom-crawler/zipball/2918e7c2ba964defca1f5b69c6f74886529e2dc8", + "reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8", + "shasum": "" + }, + "require": { + "masterminds/html5": "^2.6", + "php": ">=8.2", + "symfony/deprecation-contracts": "^2.5|^3", + "symfony/polyfill-ctype": "~1.8", + "symfony/polyfill-mbstring": "~1.0" + }, + "require-dev": { + "symfony/css-selector": "^6.4|^7.0|^8.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "Symfony\\Component\\DomCrawler\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Eases DOM navigation for HTML and XML documents", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/dom-crawler/tree/v7.4.8" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://github.com/nicolas-grekas", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2026-03-24T13:12:05+00:00" + }, { "name": "symfony/error-handler", "version": "v7.4.8", @@ -4416,7 +4731,7 @@ }, { "name": "symfony/polyfill-ctype", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-ctype.git", @@ -4475,7 +4790,7 @@ "portable" ], "support": { - "source": 
"https://github.com/symfony/polyfill-ctype/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-ctype/tree/v1.37.0" }, "funding": [ { @@ -4499,16 +4814,16 @@ }, { "name": "symfony/polyfill-intl-grapheme", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-grapheme.git", - "reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df" + "reference": "4864388bfbd3001ce88e234fab652acd91fdc57e" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/ad1b7b9092976d6c948b8a187cec9faaea9ec1df", - "reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df", + "url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/4864388bfbd3001ce88e234fab652acd91fdc57e", + "reference": "4864388bfbd3001ce88e234fab652acd91fdc57e", "shasum": "" }, "require": { @@ -4557,7 +4872,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.37.0" }, "funding": [ { @@ -4577,11 +4892,11 @@ "type": "tidelift" } ], - "time": "2026-04-10T16:19:22+00:00" + "time": "2026-04-26T13:13:48+00:00" }, { "name": "symfony/polyfill-intl-idn", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-idn.git", @@ -4644,7 +4959,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.37.0" }, "funding": [ { @@ -4668,7 +4983,7 @@ }, { "name": "symfony/polyfill-intl-normalizer", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-normalizer.git", @@ -4729,7 +5044,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.36.0" + "source": 
"https://github.com/symfony/polyfill-intl-normalizer/tree/v1.37.0" }, "funding": [ { @@ -4753,7 +5068,7 @@ }, { "name": "symfony/polyfill-mbstring", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-mbstring.git", @@ -4814,7 +5129,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-mbstring/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-mbstring/tree/v1.37.0" }, "funding": [ { @@ -4838,7 +5153,7 @@ }, { "name": "symfony/polyfill-php80", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php80.git", @@ -4898,7 +5213,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php80/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php80/tree/v1.37.0" }, "funding": [ { @@ -4922,7 +5237,7 @@ }, { "name": "symfony/polyfill-php83", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php83.git", @@ -4978,7 +5293,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php83/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php83/tree/v1.37.0" }, "funding": [ { @@ -5002,7 +5317,7 @@ }, { "name": "symfony/polyfill-php84", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php84.git", @@ -5058,7 +5373,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php84/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php84/tree/v1.37.0" }, "funding": [ { @@ -5082,16 +5397,16 @@ }, { "name": "symfony/polyfill-php85", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php85.git", - "reference": "2c408a6bb0313e6001a83628dc5506100474254e" + "reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee" }, "dist": { "type": 
"zip", - "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/2c408a6bb0313e6001a83628dc5506100474254e", - "reference": "2c408a6bb0313e6001a83628dc5506100474254e", + "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/fcfa4973a9917cef23f2e38774da74a2b7d115ee", + "reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee", "shasum": "" }, "require": { @@ -5138,7 +5453,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php85/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php85/tree/v1.37.0" }, "funding": [ { @@ -5158,11 +5473,11 @@ "type": "tidelift" } ], - "time": "2026-04-10T16:50:15+00:00" + "time": "2026-04-26T13:10:57+00:00" }, { "name": "symfony/polyfill-uuid", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-uuid.git", @@ -5221,7 +5536,7 @@ "uuid" ], "support": { - "source": "https://github.com/symfony/polyfill-uuid/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-uuid/tree/v1.37.0" }, "funding": [ { @@ -6059,16 +6374,16 @@ }, { "name": "voku/portable-ascii", - "version": "2.1.0", + "version": "2.1.1", "source": { "type": "git", "url": "https://github.com/voku/portable-ascii.git", - "reference": "d870a33f0f79d2b4579740b0620200221ee44aeb" + "reference": "8e1051fe39379367aecf014f41744ce7539a856f" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/voku/portable-ascii/zipball/d870a33f0f79d2b4579740b0620200221ee44aeb", - "reference": "d870a33f0f79d2b4579740b0620200221ee44aeb", + "url": "https://api.github.com/repos/voku/portable-ascii/zipball/8e1051fe39379367aecf014f41744ce7539a856f", + "reference": "8e1051fe39379367aecf014f41744ce7539a856f", "shasum": "" }, "require": { @@ -6105,7 +6420,7 @@ ], "support": { "issues": "https://github.com/voku/portable-ascii/issues", - "source": "https://github.com/voku/portable-ascii/tree/2.1.0" + "source": "https://github.com/voku/portable-ascii/tree/2.1.1" }, 
"funding": [ { @@ -6129,7 +6444,7 @@ "type": "tidelift" } ], - "time": "2026-04-16T23:10:39+00:00" + "time": "2026-04-26T05:33:54+00:00" } ], "packages-dev": [ diff --git a/config/cache.php b/config/cache.php index c68acdf..e3584be 100644 --- a/config/cache.php +++ b/config/cache.php @@ -112,7 +112,7 @@ | */ - 'prefix' => env('CACHE_PREFIX', Str::slug((string) env('APP_NAME', 'laravel')).'-cache-'), + 'prefix' => env('CACHE_PREFIX', Str::slug((string) env('APP_NAME', 'laravel')) . '-cache-'), /* |-------------------------------------------------------------------------- diff --git a/config/crawler.php b/config/crawler.php new file mode 100644 index 0000000..f633ce5 --- /dev/null +++ b/config/crawler.php @@ -0,0 +1,47 @@ + env('CRAWLER_TIMEOUT', 10), + + /* + |--------------------------------------------------------------------------- + | Maximum redirects to follow + |--------------------------------------------------------------------------- + | + | Guzzle default is 5. Cross-origin redirects are accepted in v0.1 — the + | search engine treats the post-redirect URL as the canonical one for + | indexing. + | + */ + + 'max_redirects' => env('CRAWLER_MAX_REDIRECTS', 5), + + /* + |--------------------------------------------------------------------------- + | User-Agent + |--------------------------------------------------------------------------- + | + | Identifies our crawler to target servers. The placeholder below is for + | v0.1 development; ticket #10 replaces it with the production identity + | and adds a `/bot` info page that the URL points at. 
+ | + */ + + 'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'), + + 'min_domain_delay_seconds' => env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10), + 'robots_cache_ttl_seconds' => env('CRAWLER_ROBOTS_CACHE_TTL_SECONDS', 60 * 60 * 24), +]; diff --git a/config/database.php b/config/database.php index 64709ce..dcf030e 100644 --- a/config/database.php +++ b/config/database.php @@ -149,7 +149,7 @@ 'options' => [ 'cluster' => env('REDIS_CLUSTER', 'redis'), - 'prefix' => env('REDIS_PREFIX', Str::slug((string) env('APP_NAME', 'laravel')).'-database-'), + 'prefix' => env('REDIS_PREFIX', Str::slug((string) env('APP_NAME', 'laravel')) . '-database-'), 'persistent' => env('REDIS_PERSISTENT', false), ], diff --git a/config/filesystems.php b/config/filesystems.php index 37d8fca..aefceac 100644 --- a/config/filesystems.php +++ b/config/filesystems.php @@ -41,7 +41,7 @@ 'public' => [ 'driver' => 'local', 'root' => storage_path('app/public'), - 'url' => rtrim(env('APP_URL', 'http://localhost'), '/').'/storage', + 'url' => rtrim(env('APP_URL', 'http://localhost'), '/') . '/storage', 'visibility' => 'public', 'throw' => false, 'report' => false, diff --git a/config/livewire.php b/config/livewire.php new file mode 100644 index 0000000..350f585 --- /dev/null +++ b/config/livewire.php @@ -0,0 +1,282 @@ + [ + resource_path('views/components'), + resource_path('views/livewire'), + ], + + /* + |--------------------------------------------------------------------------- + | Component Namespaces + |--------------------------------------------------------------------------- + | + | This value sets default namespaces that will be used to resolve view-based + | components like single-file and multi-file components. These folders'll + | also be referenced when creating new components via the make command. 
+ | + */ + + 'component_namespaces' => [ + 'layouts' => resource_path('views/layouts'), + 'pages' => resource_path('views/pages'), + ], + + /* + |--------------------------------------------------------------------------- + | Page Layout + |--------------------------------------------------------------------------- + | The view that will be used as the layout when rendering a single component as + | an entire page via `Route::livewire('/post/create', 'pages::create-post')`. + | In this case, the content of pages::create-post will render into $slot. + | + */ + + 'component_layout' => 'layouts::app', + + /* + |--------------------------------------------------------------------------- + | Lazy Loading Placeholder + |--------------------------------------------------------------------------- + | Livewire allows you to lazy load components that would otherwise slow down + | the initial page load. Every component can have a custom placeholder or + | you can define the default placeholder view for all components below. + | + */ + + 'component_placeholder' => null, // Example: 'placeholders::skeleton' + + /* + |--------------------------------------------------------------------------- + | Make Command + |--------------------------------------------------------------------------- + | This value determines the default configuration for the artisan make command + | You can configure the component type (sfc, mfc, class) and whether to use + | the high-voltage (⚡) emoji as a prefix in the sfc|mfc component names. 
+ | + */ + + 'make_command' => [ + 'type' => 'class', // Options: 'sfc', 'mfc', 'class' + 'emoji' => false, // Options: true, false + 'with' => [ + 'js' => false, + 'css' => false, + 'test' => false, + ], + ], + + /* + |--------------------------------------------------------------------------- + | Class Namespace + |--------------------------------------------------------------------------- + | + | This value sets the root class namespace for Livewire component classes in + | your application. This value will change where component auto-discovery + | finds components. It's also referenced by the file creation commands. + | + */ + + 'class_namespace' => 'App\\Livewire', + + /* + |--------------------------------------------------------------------------- + | Class Path + |--------------------------------------------------------------------------- + | + | This value is used to specify the path where Livewire component class files + | are created when running creation commands like `artisan make:livewire`. + | This path is customizable to match your projects directory structure. + | + */ + + 'class_path' => app_path('Livewire'), + + /* + |--------------------------------------------------------------------------- + | View Path + |--------------------------------------------------------------------------- + | + | This value is used to specify where Livewire component Blade templates are + | stored when running file creation commands like `artisan make:livewire`. + | It is also used if you choose to omit a component's render() method. + | + */ + + 'view_path' => resource_path('views/livewire'), + + /* + |--------------------------------------------------------------------------- + | Temporary File Uploads + |--------------------------------------------------------------------------- + | + | Livewire handles file uploads by storing uploads in a temporary directory + | before the file is stored permanently. 
All file uploads are directed to + | a global endpoint for temporary storage. You may configure this below: + | + */ + + 'temporary_file_upload' => [ + 'disk' => env('LIVEWIRE_TEMPORARY_FILE_UPLOAD_DISK'), // Example: 'local', 's3' | Default: 'default' + 'rules' => null, // Example: ['file', 'mimes:png,jpg'] | Default: ['required', 'file', 'max:12288'] (12MB) + 'directory' => null, // Example: 'tmp' | Default: 'livewire-tmp' + 'middleware' => null, // Example: 'throttle:5,1' | Default: 'throttle:60,1' + 'preview_mimes' => [ // Supported file types for temporary pre-signed file URLs... + 'png', 'gif', 'bmp', 'svg', 'wav', 'mp4', + 'mov', 'avi', 'wmv', 'mp3', 'm4a', + 'jpg', 'jpeg', 'mpga', 'webp', 'wma', + ], + 'max_upload_time' => 5, // Max duration (in minutes) before an upload is invalidated... + 'cleanup' => true, // Should cleanup temporary uploads older than 24 hrs... + ], + + /* + |--------------------------------------------------------------------------- + | Render On Redirect + |--------------------------------------------------------------------------- + | + | This value determines if Livewire will run a component's `render()` method + | after a redirect has been triggered using something like `redirect(...)` + | Setting this to true will render the view once more before redirecting + | + */ + + 'render_on_redirect' => false, + + /* + |--------------------------------------------------------------------------- + | Eloquent Model Binding + |--------------------------------------------------------------------------- + | + | Previous versions of Livewire supported binding directly to eloquent model + | properties using wire:model by default. However, this behavior has been + | deemed too "magical" and has therefore been put under a feature flag. 
+ | + */ + + 'legacy_model_binding' => false, + + /* + |--------------------------------------------------------------------------- + | Auto-inject Frontend Assets + |--------------------------------------------------------------------------- + | + | By default, Livewire automatically injects its JavaScript and CSS into the + | and of pages containing Livewire components. By disabling + | this behavior, you need to use @livewireStyles and @livewireScripts. + | + */ + + 'inject_assets' => true, + + /* + |--------------------------------------------------------------------------- + | Navigate (SPA mode) + |--------------------------------------------------------------------------- + | + | By adding `wire:navigate` to links in your Livewire application, Livewire + | will prevent the default link handling and instead request those pages + | via AJAX, creating an SPA-like effect. Configure this behavior here. + | + */ + + 'navigate' => [ + 'show_progress_bar' => true, + 'progress_bar_color' => '#2299dd', + ], + + /* + |--------------------------------------------------------------------------- + | HTML Morph Markers + |--------------------------------------------------------------------------- + | + | Livewire intelligently "morphs" existing HTML into the newly rendered HTML + | after each update. To make this process more reliable, Livewire injects + | "markers" into the rendered Blade surrounding @if, @class & @foreach. + | + */ + + 'inject_morph_markers' => true, + + /* + |--------------------------------------------------------------------------- + | Smart Wire Keys + |--------------------------------------------------------------------------- + | + | Livewire uses loops and keys used within loops to generate smart keys that + | are applied to nested components that don't have them. This makes using + | nested components more reliable by ensuring that they all have keys. 
+ | + */ + + 'smart_wire_keys' => true, + + /* + |--------------------------------------------------------------------------- + | Pagination Theme + |--------------------------------------------------------------------------- + | + | When enabling Livewire's pagination feature by using the `WithPagination` + | trait, Livewire will use Tailwind templates to render pagination views + | on the page. If you want Bootstrap CSS, you can specify: "bootstrap" + | + */ + + 'pagination_theme' => 'tailwind', + + /* + |--------------------------------------------------------------------------- + | Release Token + |--------------------------------------------------------------------------- + | + | This token is stored client-side and sent along with each request to check + | a users session to see if a new release has invalidated it. If there is + | a mismatch it will throw an error and prompt for a browser refresh. + | + */ + + 'release_token' => 'a', + + /* + |--------------------------------------------------------------------------- + | CSP Safe + |--------------------------------------------------------------------------- + | + | This config is used to determine if Livewire will use the CSP-safe version + | of Alpine in its bundle. This is useful for applications that are using + | strict Content Security Policy (CSP) to protect against XSS attacks. + | + */ + + 'csp_safe' => false, + + /* + |--------------------------------------------------------------------------- + | Payload Guards + |--------------------------------------------------------------------------- + | + | These settings protect against malicious or oversized payloads that could + | cause denial of service. The default values should feel reasonable for + | most web applications. Each can be set to null to disable the limit. 
+ | + */ + + 'payload' => [ + 'max_size' => 1024 * 1024, // 1MB - maximum request payload size in bytes + 'max_nesting_depth' => 10, // Maximum depth of dot-notation property paths + 'max_calls' => 50, // Maximum method calls per request + 'max_components' => 20, // Maximum components per batch request + ], +]; diff --git a/config/logging.php b/config/logging.php index b09cb25..b0f50f7 100644 --- a/config/logging.php +++ b/config/logging.php @@ -89,7 +89,7 @@ 'handler_with' => [ 'host' => env('PAPERTRAIL_URL'), 'port' => env('PAPERTRAIL_PORT'), - 'connectionString' => 'tls://'.env('PAPERTRAIL_URL').':'.env('PAPERTRAIL_PORT'), + 'connectionString' => 'tls://' . env('PAPERTRAIL_URL') . ':' . env('PAPERTRAIL_PORT'), ], 'processors' => [PsrLogMessageProcessor::class], ], diff --git a/config/services.php b/config/services.php index 6a90eb8..43eb064 100644 --- a/config/services.php +++ b/config/services.php @@ -14,6 +14,12 @@ | */ + 'ntfy' => [ + 'url' => env('NTFY_URL') ?: null, + 'topic' => env('NTFY_TOPIC') ?: null, + 'threshold' => env('NTFY_THRESHOLD') !== null ? (int) env('NTFY_THRESHOLD') : null, + ], + 'postmark' => [ 'key' => env('POSTMARK_API_KEY'), ], diff --git a/config/session.php b/config/session.php index f574482..c785fbc 100644 --- a/config/session.php +++ b/config/session.php @@ -129,7 +129,7 @@ 'cookie' => env( 'SESSION_COOKIE', - Str::slug((string) env('APP_NAME', 'laravel')).'-session' + Str::slug((string) env('APP_NAME', 'laravel')) . 
'-session' ), /* diff --git a/database/factories/PageCrawlFactory.php b/database/factories/PageCrawlFactory.php new file mode 100644 index 0000000..cdd6289 --- /dev/null +++ b/database/factories/PageCrawlFactory.php @@ -0,0 +1,53 @@ + + */ +class PageCrawlFactory extends Factory +{ + public function definition(): array + { + return [ + 'page_id' => null, + 'domain' => 'example.com', + 'priority' => 0, + 'completed_at' => null, + 'outcome' => null, + 'status_code' => null, + 'error_message' => null, + ]; + } + + public function page(Page $page): static + { + return $this->state(fn () => [ + 'page_id' => $page->id, + ]); + } + + public function successful(): static + { + return $this->state(fn () => [ + 'outcome' => CrawlOutcomeEnum::Success, + 'completed_at' => now(), + ]); + } + + public function failed(string $errorMessage): static + { + return $this->state(fn () => [ + 'outcome' => CrawlOutcomeEnum::Failed, + 'completed_at' => now(), + 'error_message' => $errorMessage, + ]); + } +} diff --git a/database/factories/PageFactory.php b/database/factories/PageFactory.php new file mode 100644 index 0000000..55f62ca --- /dev/null +++ b/database/factories/PageFactory.php @@ -0,0 +1,26 @@ + + */ +class PageFactory extends Factory +{ + /** + * @return array + */ + public function definition(): array + { + return [ + 'url' => fake()->url(), + 'status' => PageStatusEnum::Discovered, + ]; + } +} diff --git a/database/factories/PageLinkFactory.php b/database/factories/PageLinkFactory.php new file mode 100644 index 0000000..57a2b6f --- /dev/null +++ b/database/factories/PageLinkFactory.php @@ -0,0 +1,34 @@ + + */ +class PageLinkFactory extends Factory +{ + public function definition(): array + { + return []; + } + + public function withSource(Page $page): static + { + return $this->state(fn () => [ + 'source_page_id' => $page->id, + ]); + } + + public function withTarget(Page $page): static + { + return $this->state(fn () => [ + 'target_page_id' => $page->id, + ]); + } +} diff 
--git a/database/migrations/2026_04_25_234157_create_pages_table.php b/database/migrations/2026_04_25_234157_create_pages_table.php new file mode 100644 index 0000000..2379f87 --- /dev/null +++ b/database/migrations/2026_04_25_234157_create_pages_table.php @@ -0,0 +1,36 @@ +id(); + $table->text('url')->unique(); + $table->string('status')->default(PageStatusEnum::Discovered->value)->index(); + $table->string('language', 35)->nullable()->index(); + $table->decimal('language_confidence', 4, 3)->nullable(); + $table->string('title')->nullable(); + $table->foreignId('instance_id') + ->nullable() + ->constrained('fedi_discover_instances') + ->nullOnDelete(); + $table->timestampTz('posted_at')->nullable(); + $table->timestampTz('fetched_at')->nullable(); + $table->timestampTz('failed_at')->nullable(); + $table->timestampsTz(); + }); + } + + public function down(): void + { + Schema::dropIfExists('pages'); + } +}; diff --git a/database/migrations/2026_04_26_001957_create_page_links_table.php b/database/migrations/2026_04_26_001957_create_page_links_table.php new file mode 100644 index 0000000..b67328c --- /dev/null +++ b/database/migrations/2026_04_26_001957_create_page_links_table.php @@ -0,0 +1,27 @@ +id(); + $table->foreignId('source_page_id')->constrained('pages'); + $table->foreignId('target_page_id')->constrained('pages'); + $table->timestampsTz(); + + $table->unique(['source_page_id', 'target_page_id']); + }); + } + + public function down(): void + { + Schema::dropIfExists('page_links'); + } +}; diff --git a/database/migrations/2026_04_26_111140_create_page_crawls_table.php b/database/migrations/2026_04_26_111140_create_page_crawls_table.php new file mode 100644 index 0000000..9e18d9a --- /dev/null +++ b/database/migrations/2026_04_26_111140_create_page_crawls_table.php @@ -0,0 +1,34 @@ +id(); + $table->foreignId('page_id') + ->constrained('pages') + ->cascadeOnDelete(); + $table->string('domain'); + $table->smallInteger('priority')->default(0); + 
$table->timestampTz('completed_at')->nullable(); + $table->string('outcome')->nullable(); + $table->smallInteger('status_code')->nullable(); + $table->text('error_message')->nullable(); + $table->timestampsTz(); + + $table->index(['page_id', 'created_at']); + }); + } + + public function down(): void + { + Schema::dropIfExists('page_crawls'); + } +}; diff --git a/docker/prod/Dockerfile b/docker/prod/Dockerfile new file mode 100644 index 0000000..46d8c2f --- /dev/null +++ b/docker/prod/Dockerfile @@ -0,0 +1,128 @@ +# syntax=docker/dockerfile:1 + +# ============================================================ +# Stage 1: Build frontend assets +# ============================================================ +FROM node:20-alpine AS frontend + +WORKDIR /app + +COPY package.json package-lock.json vite.config.js ./ +COPY resources/ resources/ + +RUN npm ci --no-audit --no-fund +RUN npm run build + +# ============================================================ +# Stage 2: Runtime (FrankenPHP) +# ============================================================ +FROM dunglas/frankenphp:1.1-php8.3-alpine AS runtime + +RUN apk add --no-cache \ + git \ + postgresql-client \ + curl + +RUN install-php-extensions \ + pdo_pgsql \ + redis \ + opcache \ + zip \ + gd \ + intl + +COPY --from=composer:2 /usr/bin/composer /usr/bin/composer + +WORKDIR /app + +ENV APP_ENV=production \ + APP_DEBUG=false \ + LOG_CHANNEL=stack \ + LOG_LEVEL=warning \ + DB_CONNECTION=pgsql \ + DB_HOST=db \ + DB_PORT=5432 \ + REDIS_HOST=redis \ + REDIS_PORT=6379 \ + CACHE_STORE=redis \ + QUEUE_CONNECTION=redis \ + SESSION_DRIVER=redis \ + BROADCAST_CONNECTION=log \ + MAIL_MAILER=log + +# Copy only the files composer needs before install, so the composer layer stays +# cached when application source changes. packages/ is required because composer.json +# declares it as a path repository. 
+COPY composer.json composer.lock ./ +COPY packages/ packages/ + +# Skip post-autoload scripts (package:discover) during build — they need a runtime +# Laravel boot which fails without proper env. Discovery happens at runtime via +# start-prod.sh. --classmap-authoritative implies --optimize-autoloader. +RUN composer install --no-dev --no-interaction --prefer-dist --classmap-authoritative --no-scripts + +COPY . . +COPY --from=frontend /app/public/build /app/public/build + +RUN chown -R www-data:www-data /app/storage /app/bootstrap/cache + +RUN cat > /etc/caddy/Caddyfile <<'EOF' +{ + frankenphp + order php_server before file_server +} + +:8000 { + root * /app/public + + php_server { + index index.php + } + + encode gzip zstd + + file_server + + header { + X-Frame-Options "SAMEORIGIN" + X-Content-Type-Options "nosniff" + Referrer-Policy "strict-origin-when-cross-origin" + } +} +EOF + +EXPOSE 8000 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD curl -fsS http://localhost:8000/up || exit 1 + +RUN cat > /start-prod.sh <<'EOF' +#!/bin/sh +set -e + +echo "Waiting for PostgreSQL at ${DB_HOST}:${DB_PORT}..." +for i in $(seq 1 60); do + if pg_isready -h "${DB_HOST}" -p "${DB_PORT}" -q; then + echo "PostgreSQL is ready." + break + fi + if [ "$i" = "60" ]; then + echo "Timed out waiting for PostgreSQL after 60s." 
>&2 + exit 1 + fi + sleep 1 +done + +php artisan package:discover --ansi +php artisan config:cache +php artisan route:cache +php artisan view:cache + +php artisan migrate --force + +exec frankenphp run --config /etc/caddy/Caddyfile +EOF + +RUN chmod +x /start-prod.sh + +CMD ["/start-prod.sh"] diff --git a/packages/Lvl0/FediDiscover/config/fedi-discover.php b/packages/Lvl0/FediDiscover/config/fedi-discover.php index 355f9f3..3dff16a 100644 --- a/packages/Lvl0/FediDiscover/config/fedi-discover.php +++ b/packages/Lvl0/FediDiscover/config/fedi-discover.php @@ -3,5 +3,20 @@ declare(strict_types=1); return [ - // Instance list, polling intervals, and HTTP client config land here. + 'http' => [ + 'timeout' => 10, + // Default points at the project site so fediverse admins can always trace a Trove poller + // back to the project. Operators running their own deployment should override this via + // `php artisan vendor:publish --tag=fedi-discover-config` with their own contact URL. + 'user_agent' => 'Trove/1.0 (+https://trove.lvl0.xyz)', + 'max_redirects' => 3, + ], + + 'defaults' => [ + // Minimum recommended: 60. Mastodon/Lemmy rate limits apply per-instance. + 'interval_seconds' => 300, + ], + + // Instances are DB-managed (table: fedi_discover_instances). + // See the Instance model + admin UI (TBD). No instance list here. ]; diff --git a/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php b/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php new file mode 100644 index 0000000..209c3f7 --- /dev/null +++ b/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php @@ -0,0 +1,34 @@ +id(); + $table->string('type'); + // Instance origin, e.g. https://mastodon.social. Not a full endpoint path. 
+ $table->string('url'); + $table->boolean('enabled')->default(true); + $table->unsignedInteger('interval_seconds')->default(300); + $table->json('extras')->default('{}'); + $table->unsignedInteger('consecutive_poll_failures')->default(0); + $table->timestampTz('last_polled_at')->nullable(); + $table->string('last_seen_id')->nullable(); + $table->timestamps(); + + $table->unique(['type', 'url']); + }); + } + + public function down(): void + { + Schema::dropIfExists('fedi_discover_instances'); + } +}; diff --git a/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php b/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php new file mode 100644 index 0000000..d55da41 --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php @@ -0,0 +1,83 @@ +factory->for($instance); + $posts = $client->fetchPostsSince($instance, $instance->last_seen_id); + + $urlCount = $posts + ->map(function (FediversePost $post) use ($instance) { + try { + return $this->processLinks($post, $instance); + } catch (Throwable $e) { + Log::warning('fedi-discover:processLinks failed', [ + 'instance_id' => $instance->id, + 'instance_url' => $instance->url, + 'post_url' => $post->selfUrl, + 'exception' => $e::class, + 'message' => $e->getMessage(), + ]); + } + }) + ->sum(); + + if ($posts->isNotEmpty()) { + $instance->last_seen_id = $posts->first()->cursorId; + } + + $instance->consecutive_poll_failures = 0; + $instance->last_polled_at = now(); + $instance->save(); + + Log::info('fedi-discover:poll succeeded', [ + 'instance_id' => $instance->id, + 'url_count' => $urlCount, + 'duration_ms' => (int) round((microtime(true) - $start) * 1000), + ]); + } + + private function processLinks(FediversePost $post, Instance $instance): int + { + if ($post->body === null) { + return 0; + } + + $linksFound = preg_match_all('~https?://[^\s<>"\'()\[\]]+~', $post->body, $matches); + + if ($linksFound === 0) { + return 0; + } + + return collect($matches[0]) + ->map(fn (string $u) => 
rtrim($u, '.,;:!?')) + ->filter(fn (string $u) => filter_var($u, FILTER_VALIDATE_URL) !== false) + ->filter(fn (string $u) => parse_url($u, PHP_URL_HOST) !== parse_url($instance->url, PHP_URL_HOST)) + ->unique() + ->each(fn (string $url) => UrlDiscovered::dispatch( + url: $url, + instanceId: $instance->id, + discoveredAt: CarbonImmutable::now(), + postUrl: $post->selfUrl, + postBody: $post->body, + )) + ->count(); + } +} diff --git a/packages/Lvl0/FediDiscover/src/Clients/FediverseClientFactory.php b/packages/Lvl0/FediDiscover/src/Clients/FediverseClientFactory.php new file mode 100644 index 0000000..5cb96ca --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Clients/FediverseClientFactory.php @@ -0,0 +1,24 @@ +type) { + InstanceType::Mastodon => $this->mastodonClient, + InstanceType::Lemmy => $this->lemmyClient, + }; + } +} diff --git a/packages/Lvl0/FediDiscover/src/Clients/FediverseClientInterface.php b/packages/Lvl0/FediDiscover/src/Clients/FediverseClientInterface.php new file mode 100644 index 0000000..de74dfa --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Clients/FediverseClientInterface.php @@ -0,0 +1,22 @@ + + */ + public function fetchPostsSince(Instance $instance, ?string $lastSeenId): Collection; +} diff --git a/packages/Lvl0/FediDiscover/src/Clients/LemmyClient.php b/packages/Lvl0/FediDiscover/src/Clients/LemmyClient.php new file mode 100644 index 0000000..792972d --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Clients/LemmyClient.php @@ -0,0 +1,43 @@ +url, PHP_URL_HOST) . '/api/v3/post/list'; + + $params = $lastSeenId !== null ? ['min_id' => $lastSeenId] : []; + + $response = Http::withHeaders([ + 'User-Agent' => config('fedi-discover.http.user_agent'), + ])->timeout(config('fedi-discover.http.timeout'))->get($url, $params); + + if (! $response->successful()) { + return collect(); + } + + return collect($response->json('posts', [])) + ->map(fn (array $p) => $p['post']) + ->map(function (array $t) { + $parts = array_filter([$t['body'] ?? 
null, $t['url'] ?? null]); + $body = $parts ? implode(' ', $parts) : null; + + return new FediversePost( + cursorId: (string) $t['id'], + selfUrl: $t['ap_id'], + body: $body, + title: $t['name'], + publishedAt: $t['published'] + ); + }); + } +} diff --git a/packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php b/packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php new file mode 100644 index 0000000..e2ac205 --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php @@ -0,0 +1,36 @@ +url, PHP_URL_HOST) . '/api/v1/timelines/public'; + + $params = $lastSeenId !== null ? ['min_id' => $lastSeenId] : []; + + $response = Http::withHeaders([ + 'User-Agent' => config('fedi-discover.http.user_agent'), + ])->timeout(config('fedi-discover.http.timeout'))->get($url, $params); + + if (! $response->successful()) { + return collect(); + } + + return collect($response->json() ?? []) + ->map(fn (array $t) => new FediversePost( + cursorId: $t['id'], + selfUrl: $t['url'] ?? $t['uri'] ?? null, + body: $t['content'], + publishedAt: $t['created_at'] ?? null + )); + } +} diff --git a/packages/Lvl0/FediDiscover/src/Config/InstanceConfig.php b/packages/Lvl0/FediDiscover/src/Config/InstanceConfig.php new file mode 100644 index 0000000..b4576a4 --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Config/InstanceConfig.php @@ -0,0 +1,65 @@ + $extras + */ + public function __construct( + public InstanceType $type, + public string $url, + public bool $enabled, + public int $intervalSeconds, + public array $extras + ) {} + + /** + * @throws InvalidArgumentException + */ + public static function fromArray(array $array): self + { + foreach (['type', 'url', 'enabled', 'interval_seconds'] as $key) { + if (! 
array_key_exists($key, $array)) { + throw new InvalidArgumentException("Missing required key: {$key}"); + } + } + + if ($array['interval_seconds'] <= 0) { + throw new InvalidArgumentException('Interval seconds needs to be larger than zero'); + } + + $type = InstanceType::tryFrom($array['type']); + if ($type === null) { + throw new InvalidArgumentException('Invalid type: ' . $array['type']); + } + + if (filter_var($array['url'], FILTER_VALIDATE_URL) === false) { + throw new InvalidArgumentException('Invalid URL: ' . $array['url']); + } + + return new self( + type: $type, + url: $array['url'], + enabled: $array['enabled'], + intervalSeconds: $array['interval_seconds'], + extras: $array['extras'] ?? [] + ); + } + + public function toArray(): array + { + return [ + 'type' => $this->type->value, + 'url' => $this->url, + 'enabled' => $this->enabled, + 'interval_seconds' => $this->intervalSeconds, + 'extras' => $this->extras, + ]; + } +} diff --git a/packages/Lvl0/FediDiscover/src/Config/InstanceType.php b/packages/Lvl0/FediDiscover/src/Config/InstanceType.php new file mode 100644 index 0000000..b7c4fce --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Config/InstanceType.php @@ -0,0 +1,11 @@ +get() + ->map(function (Instance $instance) { + try { + $this->action->execute($instance); + + return ['instance_id' => $instance->id, 'status' => 'success']; + } catch (Throwable $e) { + $this->error("Failed to poll {$instance->url}: {$e->getMessage()}"); + Log::warning('fedi-discover:poll failed', [ + 'instance_id' => $instance->id, + 'instance_url' => $instance->url, + 'exception' => $e::class, + 'message' => $e->getMessage(), + ]); + + return ['instance' => $instance, 'status' => 'error', 'error' => $e->getMessage()]; + } + }) + ->filter(fn (array $res) => $res['status'] === 'error'); + + if ($errors->isEmpty()) { + return self::SUCCESS; + } + + $errors->each(fn (array $errorArr) => PollFailed::dispatch( + $errorArr['instance'], + $errorArr['error'], + now()->toImmutable(), + 
)); + + return self::FAILURE; + } +} diff --git a/packages/Lvl0/FediDiscover/src/Console/Commands/ValidateInstancesCommand.php b/packages/Lvl0/FediDiscover/src/Console/Commands/ValidateInstancesCommand.php new file mode 100644 index 0000000..99cbcd1 --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Console/Commands/ValidateInstancesCommand.php @@ -0,0 +1,64 @@ +option('enabled-only')) { + $instances->enabled(); + } + + $instances = $instances->get(); + + $invalidInstances = collect(); + + $instances->each(function (Instance $instance) use ($invalidInstances) { + $reasons = collect(); + + if (filter_var($instance->url, FILTER_VALIDATE_URL) === false) { + $reasons->add('Invalid URL: ' . $instance->url); + } + + if ($instance->interval_seconds < 1) { + $reasons->add('Invalid interval seconds: ' . $instance->interval_seconds); + } + + if ($reasons->isNotEmpty()) { + $invalidInstances->add([ + 'instance' => $instance, + 'reasons' => $reasons, + ]); + } + }); + + $this->info((string) $instances->count()); + $this->info(($instances->count() - $invalidInstances->count()) . ' valid'); + $this->line($invalidInstances->count() . ' invalid'); + + if ($invalidInstances->isNotEmpty()) { + $invalidInstances->each(function (array $instanceArray) { + $instance = $instanceArray['instance']; + $reason = $instanceArray['reasons']->join(', '); + $this->warn($instance->id . ' - ' . $instance->url); + $this->line(' : ' . 
$reason); + }); + + return self::FAILURE; + } + + return self::SUCCESS; + } +} diff --git a/packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php b/packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php new file mode 100644 index 0000000..1b7e74d --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php @@ -0,0 +1,54 @@ + + */ +class InstanceFactory extends Factory +{ + protected $model = Instance::class; + + /** + * @return array + */ + public function definition(): array + { + return [ + 'type' => null, + 'url' => fake()->url, + 'enabled' => null, + 'interval_seconds' => 600, + 'extras' => [], + 'last_seen_id' => null, + 'last_polled_at' => now(), + ]; + } + + public function type(InstanceType $type): self + { + return $this->state(fn () => [ + 'type' => $type->value, + ]); + } + + public function enabled(): self + { + return $this->state(fn () => [ + 'enabled' => true, + ]); + } + + public function disabled(): self + { + return $this->state(fn () => [ + 'enabled' => false, + ]); + } +} diff --git a/packages/Lvl0/FediDiscover/src/Events/PollFailed.php b/packages/Lvl0/FediDiscover/src/Events/PollFailed.php new file mode 100644 index 0000000..56c7b55 --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Events/PollFailed.php @@ -0,0 +1,23 @@ +mergeConfigFrom(__DIR__.'/../config/fedi-discover.php', 'fedi-discover'); + $this->mergeConfigFrom(__DIR__ . '/../config/fedi-discover.php', 'fedi-discover'); + + $this->app->singleton(FediverseClientFactory::class); } public function boot(): void { + $this->loadMigrationsFrom(__DIR__ . '/../database/migrations'); + if ($this->app->runningInConsole()) { $this->publishes([ - __DIR__.'/../config/fedi-discover.php' => config_path('fedi-discover.php'), + __DIR__ . 
'/../config/fedi-discover.php' => config_path('fedi-discover.php'), ], 'fedi-discover-config'); + + $this->commands([ + PollInstancesCommand::class, + ValidateInstancesCommand::class, + ]); } } } diff --git a/packages/Lvl0/FediDiscover/src/Models/Instance.php b/packages/Lvl0/FediDiscover/src/Models/Instance.php new file mode 100644 index 0000000..9d61119 --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Models/Instance.php @@ -0,0 +1,64 @@ + $extras + * @property string|null $last_seen_id + * @property int $consecutive_poll_failures + * @property Carbon|null $last_polled_at + * @property Carbon $created_at + * @property Carbon $updated_at + */ +class Instance extends Model +{ + /** @use HasFactory */ + use HasFactory; + + protected $table = 'fedi_discover_instances'; + + protected $fillable = ['type', 'url', 'enabled', 'interval_seconds', 'extras', 'last_seen_id', 'last_polled_at', 'consecutive_poll_failures']; + + protected $casts = [ + 'type' => InstanceType::class, + 'enabled' => 'boolean', + 'extras' => 'array', + 'last_polled_at' => 'datetime', + ]; + + /** + * @param Builder $query + * @return Builder + */ + public function scopeEnabled(Builder $query): Builder + { + return $query->where('enabled', true); + } + + protected static function newFactory(): Factory + { + return InstanceFactory::new(); + } + + public function pages(): HasMany + { + return $this->hasMany(Page::class); + } +} diff --git a/packages/Lvl0/FediDiscover/src/ValueObjects/FediversePost.php b/packages/Lvl0/FediDiscover/src/ValueObjects/FediversePost.php new file mode 100644 index 0000000..987a84c --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/ValueObjects/FediversePost.php @@ -0,0 +1,16 @@ + InstanceType::Mastodon, 'url' => 'https://mastodon.social']); + + $client = $factory->for($instance); + + $this->assertInstanceOf(MastodonClient::class, $client); + } + + public function test_it_resolves_lemmy_client_for_lemmy_instance_type(): void + { + $factory = 
app(FediverseClientFactory::class); + + $instance = new Instance(['type' => InstanceType::Lemmy, 'url' => 'https://lemmy.world']); + + $client = $factory->for($instance); + + $this->assertInstanceOf(LemmyClient::class, $client); + } + + public function test_it_is_registered_as_a_singleton_in_the_container(): void + { + $a = $this->app->make(FediverseClientFactory::class); + $b = $this->app->make(FediverseClientFactory::class); + + $this->assertSame($a, $b); + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Feature/InstanceConfigPersistenceTest.php b/packages/Lvl0/FediDiscover/tests/Feature/InstanceConfigPersistenceTest.php new file mode 100644 index 0000000..ee35919 --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Feature/InstanceConfigPersistenceTest.php @@ -0,0 +1,57 @@ + InstanceType::Mastodon->value, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => ['token' => 'abc123'], + ]); + + Instance::create($config->toArray()); + + $this->artisan('fedi-discover:validate') + ->assertExitCode(0); + } + + public function test_an_instance_config_survives_a_write_read_cycle_through_the_model(): void + { + $original = InstanceConfig::fromArray([ + 'type' => InstanceType::Mastodon->value, + 'url' => 'https://hachyderm.io', + 'enabled' => false, + 'interval_seconds' => 900, + 'extras' => ['foo' => 'bar'], + ]); + + Instance::create($original->toArray()); + + $instance = Instance::query()->firstOrFail(); + + $roundTripped = InstanceConfig::fromArray([ + 'type' => $instance->type->value, + 'url' => $instance->url, + 'enabled' => $instance->enabled, + 'interval_seconds' => $instance->interval_seconds, + 'extras' => $instance->extras, + ]); + + $this->assertEquals($original, $roundTripped); + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Feature/InstanceModelTest.php b/packages/Lvl0/FediDiscover/tests/Feature/InstanceModelTest.php new file mode 100644 index 0000000..d6cde01 --- /dev/null +++ 
b/packages/Lvl0/FediDiscover/tests/Feature/InstanceModelTest.php @@ -0,0 +1,113 @@ + InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => ['token' => 'abc123'], + ]); + + $instance = Instance::first(); + + $this->assertNotNull($instance); + $this->assertSame(InstanceType::Mastodon, $instance->type); + $this->assertSame('https://mastodon.social', $instance->url); + $this->assertTrue($instance->enabled); + $this->assertSame(600, $instance->interval_seconds); + $this->assertSame(['token' => 'abc123'], $instance->extras); + } + + public function test_enabled_is_fillable_and_cast_to_boolean(): void + { + $instance = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => false, + 'interval_seconds' => 600, + ]); + + $this->assertFalse($instance->fresh()->enabled); + } + + public function test_last_polled_at_is_fillable_and_cast_to_datetime(): void + { + $polledAt = Carbon::parse('2026-04-23 12:00:00'); + + $instance = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'last_polled_at' => $polledAt, + ]); + + $fresh = $instance->fresh(); + + $this->assertInstanceOf(Carbon::class, $fresh->last_polled_at); + $this->assertTrue($fresh->last_polled_at->equalTo($polledAt)); + } + + public function test_last_seen_id_defaults_to_null(): void + { + $instance = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + $this->assertNull($instance->fresh()->last_seen_id); + } + + public function test_last_seen_id_is_fillable_and_persists_as_string(): void + { + $instance = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'last_seen_id' => '109876543210', + ]); + + 
$this->assertSame('109876543210', $instance->fresh()->last_seen_id); + } + + public function test_enabled_scope_returns_only_enabled_instances(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://enabled.example', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://disabled.example', + 'enabled' => false, + 'interval_seconds' => 600, + ]); + + $enabled = Instance::enabled()->get(); + + $this->assertCount(1, $enabled); + $this->assertSame('https://enabled.example', $enabled->first()->url); + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Feature/LemmyClientTest.php b/packages/Lvl0/FediDiscover/tests/Feature/LemmyClientTest.php new file mode 100644 index 0000000..e06b6b1 --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Feature/LemmyClientTest.php @@ -0,0 +1,150 @@ + Http::response([ + 'posts' => [ + $this->lemmyPost( + id: 42, + apId: 'https://lemmy.world/post/42', + name: 'My Great Post', + body: 'Some body text', + published: '2026-04-25T10:00:00.000000', + ), + ], + ], 200), + ]); + + $posts = (new LemmyClient)->fetchPostsSince($this->lemmyInstance(), null); + + $this->assertCount(1, $posts); + $this->assertInstanceOf(FediversePost::class, $posts->first()); + $this->assertSame('42', $posts->first()->cursorId); + $this->assertSame('https://lemmy.world/post/42', $posts->first()->selfUrl); + $this->assertSame('My Great Post', $posts->first()->title); + $this->assertSame('Some body text', $posts->first()->body); + $this->assertSame('2026-04-25T10:00:00.000000', $posts->first()->publishedAt); + } + + public function test_url_field_is_appended_to_body(): void + { + Http::fake([ + '*' => Http::response([ + 'posts' => [ + $this->lemmyPost( + id: 42, + apId: 'https://lemmy.world/post/42', + url: 'https://example-garden.blog/post-42', + body: 'Some original text.', + ), + ], + ], 200), + ]); + + $post = (new 
LemmyClient)->fetchPostsSince($this->lemmyInstance(), null)->first(); + + $this->assertStringContainsString('Some original text.', $post->body); + $this->assertStringContainsString('https://example-garden.blog/post-42', $post->body); + } + + public function test_url_absent_leaves_body_clean(): void + { + Http::fake([ + '*' => Http::response([ + 'posts' => [ + $this->lemmyPost( + id: 7, + apId: 'https://lemmy.world/post/7', + body: 'Just a regular post.', + ), + ], + ], 200), + ]); + + $post = (new LemmyClient)->fetchPostsSince($this->lemmyInstance(), null)->first(); + + $this->assertSame('Just a regular post.', $post->body); + } + + public function test_it_handles_posts_without_a_body_key(): void + { + Http::fake([ + '*' => Http::response([ + 'posts' => [ + [ + 'post' => [ + 'id' => 99, + 'ap_id' => 'https://lemmy.world/post/99', + 'url' => null, + 'name' => 'Link-only post', + 'published' => '2026-04-25T10:00:00.000000', + // 'body' key intentionally absent — real Lemmy API omits it for link-only posts + ], + ], + ], + ], 200), + ]); + + $post = (new LemmyClient)->fetchPostsSince($this->lemmyInstance(), null)->first(); + + $this->assertNull($post->body); + } + + public function test_it_hits_the_post_list_endpoint_of_the_instance(): void + { + Http::fake([ + 'lemmy.world/api/v3/post/list*' => Http::response(['posts' => []], 200), + ]); + + (new LemmyClient)->fetchPostsSince($this->lemmyInstance(), null); + + Http::assertSent(fn ($request) => str_starts_with($request->url(), 'https://lemmy.world/api/v3/post/list') + && $request->method() === 'GET' + ); + } + + private function lemmyInstance(): Instance + { + return new Instance([ + 'type' => InstanceType::Lemmy, + 'url' => 'https://lemmy.world', + ]); + } + + /** + * @return array + */ + private function lemmyPost( + int $id, + string $apId, + ?string $url = null, + string $body = '', + string $name = 'A post title', + string $published = '2026-04-25T10:00:00.000000', + ): array { + return [ + 'post' => [ + 'id' => 
$id, + 'ap_id' => $apId, + 'url' => $url, + 'body' => $body, + 'name' => $name, + 'published' => $published, + ], + ]; + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Feature/MastodonClientTest.php b/packages/Lvl0/FediDiscover/tests/Feature/MastodonClientTest.php new file mode 100644 index 0000000..0516bad --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Feature/MastodonClientTest.php @@ -0,0 +1,191 @@ + Http::response([], 200), + ]); + + (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + Http::assertSent(fn ($request) => str_starts_with($request->url(), 'https://mastodon.social/api/v1/timelines/public') + && $request->method() === 'GET' + ); + } + + public function test_it_omits_min_id_on_first_poll(): void + { + Http::fake(['*' => Http::response([], 200)]); + + (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + Http::assertSent(fn ($request) => ! str_contains($request->url(), 'min_id')); + } + + public function test_it_passes_min_id_on_subsequent_polls(): void + { + Http::fake(['*' => Http::response([], 200)]); + + (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), '109876543210'); + + Http::assertSent(fn ($request) => str_contains($request->url(), 'min_id=109876543210')); + } + + public function test_it_returns_an_empty_collection_when_the_api_returns_no_posts(): void + { + Http::fake(['*' => Http::response([], 200)]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertInstanceOf(Collection::class, $posts); + $this->assertTrue($posts->isEmpty()); + } + + public function test_it_maps_each_status_to_a_fediverse_post(): void + { + Http::fake([ + '*' => Http::response([ + $this->mastodonStatus(id: '109876543210', url: 'https://mastodon.social/@alice/109876543210', content: '

Hello

'), + $this->mastodonStatus(id: '109876543211', url: 'https://mastodon.social/@bob/109876543211', content: '

World

'), + ], 200), + ]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertCount(2, $posts); + $this->assertInstanceOf(FediversePost::class, $posts->first()); + $this->assertSame('109876543210', $posts->first()->cursorId); + $this->assertSame('https://mastodon.social/@alice/109876543210', $posts->first()->selfUrl); + $this->assertSame('

Hello

', $posts->first()->body); + } + + public function test_it_maps_published_at_from_created_at(): void + { + Http::fake([ + '*' => Http::response([ + $this->mastodonStatus(id: '109876543210', url: 'https://mastodon.social/@alice/109876543210'), + ], 200), + ]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertSame('2026-04-25T10:00:00Z', $posts->first()->publishedAt); + } + + public function test_it_sets_title_to_null_for_mastodon_statuses(): void + { + Http::fake([ + '*' => Http::response([ + $this->mastodonStatus(id: '109876543210', url: 'https://mastodon.social/@alice/109876543210'), + ], 200), + ]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertNull($posts->first()->title); + } + + public function test_it_falls_back_to_uri_when_url_is_null(): void + { + Http::fake([ + '*' => Http::response([ + $this->mastodonStatus( + id: '109876543210', + url: null, + uri: 'https://hachyderm.io/users/bob/statuses/5678', + content: '

federated post

' + ), + ], 200), + ]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertSame('https://hachyderm.io/users/bob/statuses/5678', $posts->first()->selfUrl); + } + + public function test_it_preserves_newest_first_ordering_from_the_api(): void + { + Http::fake([ + '*' => Http::response([ + $this->mastodonStatus(id: '300', url: 'https://mastodon.social/@a/300'), + $this->mastodonStatus(id: '200', url: 'https://mastodon.social/@b/200'), + $this->mastodonStatus(id: '100', url: 'https://mastodon.social/@c/100'), + ], 200), + ]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertSame(['300', '200', '100'], $posts->pluck('cursorId')->all()); + } + + public function test_it_returns_an_empty_collection_on_a_non_2xx_response(): void + { + Http::fake(['*' => Http::response('Too many requests', 429)]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertInstanceOf(Collection::class, $posts); + $this->assertTrue($posts->isEmpty()); + } + + public function test_it_returns_an_empty_collection_when_the_response_body_is_not_json(): void + { + Http::fake(['*' => Http::response('error', 200)]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertInstanceOf(Collection::class, $posts); + $this->assertTrue($posts->isEmpty()); + } + + public function test_it_sends_the_configured_user_agent(): void + { + Http::fake(['*' => Http::response([], 200)]); + + (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $expected = config('fedi-discover.http.user_agent'); + Http::assertSent(fn ($request) => $request->header('User-Agent')[0] === $expected); + } + + private function mastodonInstance(): Instance + { + return new Instance([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + ]); + } + + /** + * @return array + */ + private function mastodonStatus( 
+ string $id, + ?string $url = null, + ?string $uri = null, + string $content = '

example

', + ): array { + return [ + 'id' => $id, + 'url' => $url, + 'uri' => $uri ?? "https://mastodon.social/users/x/statuses/{$id}", + 'content' => $content, + 'created_at' => '2026-04-25T10:00:00Z', + 'account' => ['acct' => 'alice@mastodon.social'], + ]; + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php b/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php new file mode 100644 index 0000000..0056d44 --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php @@ -0,0 +1,268 @@ +poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://example.com/one and https://other.example/two'), + ]); + + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/one'); + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://other.example/two'); + Event::assertDispatchedTimes(UrlDiscovered::class, 2); + } + + public function test_it_extracts_urls_from_html_anchor_tags(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', '

Check this!

'), + ]); + + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/article'); + Event::assertDispatchedTimes(UrlDiscovered::class, 1); + } + + public function test_it_extracts_urls_from_markdown_links(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll( + posts: [new FediversePost('1', 'https://lemmy.world/post/42', 'A [great article](https://example.com/article) about trees.')], + instanceUrl: 'https://lemmy.world', + ); + + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/article'); + Event::assertDispatchedTimes(UrlDiscovered::class, 1); + } + + public function test_it_strips_trailing_punctuation_from_urls(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'Check https://example.com/article, it is great. Also https://other.example/page.'), + ]); + + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/article'); + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://other.example/page'); + } + + public function test_it_deduplicates_urls_within_a_single_post(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'Here is https://example.com/article and again https://example.com/article'), + ]); + + Event::assertDispatchedTimes(UrlDiscovered::class, 1); + } + + public function test_it_filters_urls_on_the_polling_instance_host(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://mastodon.social/@bob/42 and https://example.com/article'), + ]); + + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/article'); + Event::assertDispatchedTimes(UrlDiscovered::class, 1); + } 
+ + public function test_it_ignores_posts_with_a_null_body(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', null), + ]); + + Event::assertNotDispatched(UrlDiscovered::class); + } + + public function test_it_ignores_non_http_schemes(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'Email mailto:alice@example.com or try ftp://files.example.com/x'), + ]); + + Event::assertNotDispatched(UrlDiscovered::class); + } + + public function test_it_passes_post_self_url_and_body_through_to_the_event(): void + { + Event::fake([UrlDiscovered::class]); + + $instance = $this->makeInstance(); + $body = 'Here is https://example.com/article with surrounding context.'; + + $this->pollInstance($instance, [ + new FediversePost('1', 'https://mastodon.social/@alice/1', $body), + ]); + + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->postUrl === 'https://mastodon.social/@alice/1' + && $e->postBody === $body + && $e->instanceId === $instance->id + && $e->discoveredAt instanceof CarbonImmutable + ); + } + + public function test_it_processes_multiple_posts(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://example.com/one'), + new FediversePost('2', 'https://mastodon.social/@bob/2', 'Also https://example.com/two'), + ]); + + Event::assertDispatchedTimes(UrlDiscovered::class, 2); + } + + public function test_it_updates_last_seen_id_to_the_first_posts_cursor(): void + { + $instance = $this->makeInstance(); + + // Clients return newest-first; the action treats posts[0] + // as the new high-water mark without inspecting cursor values. 
+ $this->pollInstance($instance, [ + new FediversePost('newest-cursor', 'https://mastodon.social/@alice/3', 'x'), + new FediversePost('middle-cursor', 'https://mastodon.social/@bob/2', 'y'), + new FediversePost('oldest-cursor', 'https://mastodon.social/@carol/1', 'z'), + ]); + + $this->assertSame('newest-cursor', $instance->fresh()->last_seen_id); + } + + public function test_it_updates_last_polled_at(): void + { + $instance = $this->makeInstance(); + $this->assertNull($instance->last_polled_at); + + $this->pollInstance($instance, [ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'x'), + ]); + + $this->assertNotNull($instance->fresh()->last_polled_at); + } + + public function test_it_passes_the_existing_last_seen_id_to_the_client(): void + { + $instance = $this->makeInstance(['last_seen_id' => '999']); + + $client = Mockery::mock(FediverseClientInterface::class); + $client->shouldReceive('fetchPostsSince') + ->once() + ->with($instance, $instance->last_seen_id) + ->andReturn(collect()); + + $factory = Mockery::mock(FediverseClientFactory::class); + $factory->shouldReceive('for')->with($instance)->andReturn($client); + + (new PollFediverseAction($factory))->execute($instance); + } + + public function test_it_leaves_last_seen_id_unchanged_when_no_posts_are_returned(): void + { + $instance = $this->makeInstance(['last_seen_id' => '500']); + + $this->pollInstance($instance, []); + + $this->assertSame('500', $instance->fresh()->last_seen_id); + } + + public function test_consecutive_poll_failures_reset_to_zero_after_successful_poll(): void + { + $instance = $this->makeInstance(['consecutive_poll_failures' => 5]); + + $this->pollInstance($instance, []); + + $this->assertSame(0, $instance->fresh()->consecutive_poll_failures); + } + + public function test_poll_logs_a_structured_success_entry_with_url_count_and_duration(): void + { + Log::spy(); + Event::fake([UrlDiscovered::class]); + + $instance = $this->makeInstance(); + + $this->pollInstance($instance, [ + 
new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://example.com/one and https://other.example/two'), + new FediversePost('2', 'https://mastodon.social/@bob/2', 'Also https://example.com/three'), + ]); + + Log::shouldHaveReceived('info') + ->once() + ->withArgs(function (string $message, array $context) use ($instance): bool { + return $message === 'fedi-discover:poll succeeded' + && $context['instance_id'] === $instance->id + && $context['url_count'] === 3 + && isset($context['duration_ms']) + && $context['duration_ms'] >= 0; + }); + } + + /** + * @param array $posts + */ + private function poll(array $posts, string $instanceUrl = 'https://mastodon.social'): void + { + $this->pollInstance($this->makeInstance(['url' => $instanceUrl]), $posts); + } + + /** + * @param array $posts + */ + private function pollInstance(Instance $instance, array $posts): void + { + $client = Mockery::mock(FediverseClientInterface::class); + $client->shouldReceive('fetchPostsSince')->andReturn(collect($posts)); + + $factory = Mockery::mock(FediverseClientFactory::class); + $factory->shouldReceive('for')->andReturn($client); + + (new PollFediverseAction($factory))->execute($instance); + } + + /** + * @param array $overrides + */ + private function makeInstance(array $overrides = []): Instance + { + return Instance::create(array_merge([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + ], $overrides)); + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php b/packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php new file mode 100644 index 0000000..a449552 --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php @@ -0,0 +1,202 @@ +shouldReceive('fetchPostsSince')->andReturn(collect()); + + $factoryStub = Mockery::mock(FediverseClientFactory::class); + $factoryStub->shouldReceive('for')->andReturn($clientStub); + + 
$this->app->instance(FediverseClientFactory::class, $factoryStub); + } + + public function test_it_exits_zero_when_there_are_no_enabled_instances(): void + { + $this->artisan('fedi-discover:poll') + ->assertExitCode(0); + } + + public function test_it_calls_the_action_for_each_enabled_instance_and_skips_disabled(): void + { + $enabled1 = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + $enabled2 = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://fosstodon.org', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://disabled.example', + 'enabled' => false, + 'interval_seconds' => 600, + ]); + + $calledWith = []; + + $action = Mockery::mock(PollFediverseAction::class); + $action->shouldReceive('execute') + ->twice() + ->withArgs(function (Instance $instance) use (&$calledWith): bool { + $calledWith[] = $instance->url; + + return true; + }); + + $this->app->instance(PollFediverseAction::class, $action); + + $this->artisan('fedi-discover:poll')->assertExitCode(0); + + $this->assertEqualsCanonicalizing( + [$enabled1->url, $enabled2->url], + $calledWith, + ); + } + + public function test_one_instance_throwing_does_not_stop_remaining_instances_from_being_polled(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://failing.example', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + $healthy = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://healthy.example', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + $calledWith = []; + + $action = Mockery::mock(PollFediverseAction::class); + $action->shouldReceive('execute') + ->twice() + ->andReturnUsing(function (Instance $instance) use (&$calledWith): void { + $calledWith[] = $instance->url; + + if ($instance->url === 
'https://failing.example') { + throw new RuntimeException('Connection refused'); + } + }); + + $this->app->instance(PollFediverseAction::class, $action); + + $this->artisan('fedi-discover:poll')->assertExitCode(1); + + $this->assertEqualsCanonicalizing( + ['https://failing.example', $healthy->url], + $calledWith, + ); + } + + public function test_poll_failed_event_is_dispatched_when_action_throws(): void + { + Event::fake([PollFailed::class]); + + $instance = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://failing.example', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + $action = Mockery::mock(PollFediverseAction::class); + $action->shouldReceive('execute') + ->once() + ->andReturnUsing(function (): void { + throw new RuntimeException('Connection refused'); + }); + + $this->app->instance(PollFediverseAction::class, $action); + + $this->artisan('fedi-discover:poll'); + + Event::assertDispatched(PollFailed::class, function (PollFailed $event) use ($instance): bool { + return $event->instance->id === $instance->id + && $event->message === 'Connection refused'; + }); + } + + public function test_poll_failed_event_is_not_dispatched_on_a_successful_poll(): void + { + Event::fake([PollFailed::class]); + + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://healthy.example', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + // setUp() already binds a no-op action stub via the factory; no override needed. 
+ + $this->artisan('fedi-discover:poll'); + + Event::assertNotDispatched(PollFailed::class); + } + + public function test_it_exits_one_when_at_least_one_instance_fails(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://failing.example', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://healthy.example', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + $action = Mockery::mock(PollFediverseAction::class); + $action->shouldReceive('execute') + ->twice() + ->andReturnUsing(function (Instance $instance): void { + if ($instance->url === 'https://failing.example') { + throw new RuntimeException('Connection refused'); + } + }); + + $this->app->instance(PollFediverseAction::class, $action); + + $this->artisan('fedi-discover:poll')->assertExitCode(1); + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Feature/ValidateInstancesCommandTest.php b/packages/Lvl0/FediDiscover/tests/Feature/ValidateInstancesCommandTest.php new file mode 100644 index 0000000..878d690 --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Feature/ValidateInstancesCommandTest.php @@ -0,0 +1,221 @@ +artisan('fedi-discover:validate') + ->assertExitCode(0); + } + + public function test_it_exits_zero_when_all_instances_are_valid(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate') + ->assertExitCode(0); + } + + public function test_it_exits_nonzero_when_a_row_has_an_invalid_url(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'not-a-url', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate') + ->assertExitCode(1); + } + + public function test_it_exits_nonzero_when_a_row_has_a_zero_interval(): void + { + 
Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 0, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate') + ->assertExitCode(1); + } + + public function test_it_reports_summary_of_valid_and_invalid_counts(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://hachyderm.io', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'bogus', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate') + ->expectsOutputToContain('3') + ->expectsOutputToContain('2 valid') + ->expectsOutputToContain('1 invalid') + ->assertExitCode(1); + } + + public function test_it_does_not_fail_fast_and_reports_every_invalid_row(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'bogus-one', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + $second = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 0, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate') + ->expectsOutputToContain('bogus-one') + ->expectsOutputToContain((string) $second->id) + ->assertExitCode(1); + } + + public function test_it_includes_the_validation_error_message_for_each_invalid_row(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'not-a-url', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate') + ->expectsOutputToContain('Invalid URL: not-a-url') + ->assertExitCode(1); + } + + public function 
test_summary_counts_are_accurate_when_mixed(): void + { + // 2 valid + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://hachyderm.io', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + // 3 invalid (different defects) + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'bogus-one', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://fosstodon.org', + 'enabled' => true, + 'interval_seconds' => 0, + 'extras' => [], + ]); + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'also-bad', + 'enabled' => true, + 'interval_seconds' => -5, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate') + ->expectsOutputToContain('5') + ->expectsOutputToContain('2 valid') + ->expectsOutputToContain('3 invalid') + ->assertExitCode(1); + } + + public function test_it_exits_zero_with_enabled_only_when_no_enabled_instances_exist(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => false, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate', ['--enabled-only' => true]) + ->assertExitCode(0); + } + + public function test_it_exits_zero_with_an_enabled_only_flag_when_disabled_rows_are_invalid(): void + { + // A disabled row that would fail InstanceConfig validation + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'broken-and-disabled', + 'enabled' => false, + 'interval_seconds' => 0, + 'extras' => [], + ]); + + // A valid enabled row + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + 
+ $this->artisan('fedi-discover:validate', ['--enabled-only' => true]) + ->assertExitCode(0); + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Unit/InstanceConfigTest.php b/packages/Lvl0/FediDiscover/tests/Unit/InstanceConfigTest.php new file mode 100644 index 0000000..03ffbf7 --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Unit/InstanceConfigTest.php @@ -0,0 +1,121 @@ + 'mastodon', + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => ['token' => 'abc123'], + ]); + + $this->assertSame(InstanceType::Mastodon, $config->type); + $this->assertSame('https://mastodon.social', $config->url); + $this->assertTrue($config->enabled); + $this->assertSame(600, $config->intervalSeconds); + $this->assertSame(['token' => 'abc123'], $config->extras); + } + + public function test_from_array_rejects_non_positive_interval_seconds(): void + { + $this->expectException(\InvalidArgumentException::class); + + InstanceConfig::fromArray([ + 'type' => 'mastodon', + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 0, + 'extras' => [], + ]); + } + + public function test_extras_defaults_to_empty_array_when_omitted(): void + { + $config = InstanceConfig::fromArray([ + 'type' => 'mastodon', + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + $this->assertSame([], $config->extras); + } + + #[DataProvider('requiredKeyProvider')] + public function test_from_array_throws_when_required_key_is_missing(string $missingKey): void + { + $input = [ + 'type' => 'mastodon', + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + ]; + + unset($input[$missingKey]); + + $this->expectException(\InvalidArgumentException::class); + $this->expectExceptionMessageMatches('/' . preg_quote($missingKey, '/') . 
'/'); + + InstanceConfig::fromArray($input); + } + + public static function requiredKeyProvider(): array + { + return [ + 'type missing' => ['type'], + 'url missing' => ['url'], + 'enabled missing' => ['enabled'], + 'interval_seconds missing' => ['interval_seconds'], + ]; + } + + public function test_from_array_throws_invalid_argument_exception_for_unknown_type_string(): void + { + $this->expectException(\InvalidArgumentException::class); + $this->expectExceptionMessageMatches('/pleroma/'); + + InstanceConfig::fromArray([ + 'type' => 'pleroma', + 'url' => 'https://pleroma.example.com', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + } + + public function test_from_array_rejects_malformed_url(): void + { + $this->expectException(\InvalidArgumentException::class); + + InstanceConfig::fromArray([ + 'type' => 'mastodon', + 'url' => 'not a url', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + } + + public function test_to_array_produces_array_that_round_trips_through_from_array(): void + { + $original = [ + 'type' => 'mastodon', + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => ['token' => 'abc123'], + ]; + + $this->assertSame($original, InstanceConfig::fromArray($original)->toArray()); + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Unit/PollFailedTest.php b/packages/Lvl0/FediDiscover/tests/Unit/PollFailedTest.php new file mode 100644 index 0000000..d5ebf8d --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Unit/PollFailedTest.php @@ -0,0 +1,31 @@ +id = 7; + + $failedAt = CarbonImmutable::parse('2026-04-28T09:00:00'); + + $event = new PollFailed( + instance: $instance, + message: 'Connection timed out', + failedAt: $failedAt, + ); + + $this->assertSame($instance, $event->instance); + $this->assertSame('Connection timed out', $event->message); + $this->assertTrue($failedAt->eq($event->failedAt)); + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Unit/UrlDiscoveredTest.php 
b/packages/Lvl0/FediDiscover/tests/Unit/UrlDiscoveredTest.php new file mode 100644 index 0000000..a16c795 --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Unit/UrlDiscoveredTest.php @@ -0,0 +1,44 @@ +assertSame('https://example.com/article', $event->url); + $this->assertSame(42, $event->instanceId); + $this->assertTrue($discoveredAt->eq($event->discoveredAt)); + $this->assertSame('https://mastodon.social/@alice/109876543210', $event->postUrl); + $this->assertSame('Check out this article: https://example.com/article', $event->postBody); + } + + public function test_post_body_is_nullable(): void + { + $event = new UrlDiscovered( + url: 'https://example.com/article', + instanceId: 1, + discoveredAt: CarbonImmutable::parse('2026-04-26T12:00:00'), + postUrl: 'https://mastodon.social/@alice/109876543210', + postBody: null + ); + + $this->assertNull($event->postBody); + } +} diff --git a/phpunit.xml b/phpunit.xml index 46d97dd..ac75c66 100644 --- a/phpunit.xml +++ b/phpunit.xml @@ -3,6 +3,11 @@ xsi:noNamespaceSchemaLocation="vendor/phpunit/phpunit/phpunit.xsd" bootstrap="vendor/autoload.php" colors="true" + processIsolation="false" + displayDetailsOnPhpunitDeprecations="true" + displayDetailsOnTestsThatTriggerErrors="true" + displayDetailsOnTestsThatTriggerWarnings="true" + displayDetailsOnTestsThatTriggerNotices="true" > @@ -22,19 +27,21 @@ - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + diff --git a/pint.json b/pint.json new file mode 100644 index 0000000..ae7601d --- /dev/null +++ b/pint.json @@ -0,0 +1,8 @@ +{ + "preset": "laravel", + "rules": { + "concat_space": { + "spacing": "one" + } + } +} diff --git a/public/index.php b/public/index.php index ee8f07e..86bfe78 100644 --- a/public/index.php +++ b/public/index.php @@ -6,15 +6,15 @@ define('LARAVEL_START', microtime(true)); // Determine if the application is in maintenance mode... 
-if (file_exists($maintenance = __DIR__.'/../storage/framework/maintenance.php')) { +if (file_exists($maintenance = __DIR__ . '/../storage/framework/maintenance.php')) { require $maintenance; } // Register the Composer autoloader... -require __DIR__.'/../vendor/autoload.php'; +require __DIR__ . '/../vendor/autoload.php'; // Bootstrap Laravel and handle the request... /** @var Application $app */ -$app = require_once __DIR__.'/../bootstrap/app.php'; +$app = require_once __DIR__ . '/../bootstrap/app.php'; $app->handleRequest(Request::capture()); diff --git a/resources/views/admin/index.blade.php b/resources/views/admin/index.blade.php new file mode 100644 index 0000000..033c409 --- /dev/null +++ b/resources/views/admin/index.blade.php @@ -0,0 +1,29 @@ +@extends('layouts.app') + +@section('content') +
+

Instances

+ + + + + + + + + + + + @foreach($instances as $instance) + + + + + + + @endforeach + +
InstanceLast polled atURLsErrors
{{ $instance->url }}{{ $instance->last_polled_at }}{{ $instance->pages_count }} URLs{{ $instance->failed_pages_count }} errors
+
+@endsection + diff --git a/resources/views/bot.blade.php b/resources/views/bot.blade.php new file mode 100644 index 0000000..ad4ef5d --- /dev/null +++ b/resources/views/bot.blade.php @@ -0,0 +1,63 @@ +@extends('layouts.app') + +@section('content') + +
+

About TroveBot

+ +

+ Trove is a federated search engine for the small web, + seeded by fediverse attention and ranked by domain coherence rather than + commercial authority. TroveBot is its crawler — it + discovers and indexes URLs shared by people on the fediverse, then + follows the citations they make to find more of the small web. +

+ +

Identity

+ +

TroveBot identifies itself with the following User-Agent string:

+ +
TroveBot/0.1 (+https://trove.lvl0.xyz/bot)
+ +

Crawling behavior

+ +
    +
  • Respects robots.txt rules under User-agent: TroveBot (and the wildcard User-agent: * as a fallback).
  • +
  • Applies a polite per-domain rate limit — at most a few requests per minute per host.
  • +
  • Follows up to 5 redirects per URL.
  • +
  • Fetches HTML only. PDFs, images, and other binary content are recorded as discovered but never re-fetched.
  • +
  • Does not execute JavaScript, does not crawl behind authentication, and does not crawl URLs containing user credentials.
  • +
+ +

Opt out

+ +

+ Block TroveBot entirely by adding the following to your site's + robots.txt: +

+ +
User-agent: TroveBot
+Disallow: /
+ +

+ Or block specific paths: +

+ +
User-agent: TroveBot
+Disallow: /private/
+Disallow: /admin/
+ +

Contact & source

+ + +
+@endsection diff --git a/resources/views/layouts/app.blade.php b/resources/views/layouts/app.blade.php new file mode 100644 index 0000000..97a6501 --- /dev/null +++ b/resources/views/layouts/app.blade.php @@ -0,0 +1,18 @@ + + + + + + + Trove @yield('title', config('app.name')) + + @vite(['resources/css/app.css', 'resources/js/app.js']) + + @livewireStyles + + + @yield('content') + + @livewireScripts + + diff --git a/resources/views/livewire/url-submission-form.blade.php b/resources/views/livewire/url-submission-form.blade.php new file mode 100644 index 0000000..49da751 --- /dev/null +++ b/resources/views/livewire/url-submission-form.blade.php @@ -0,0 +1,14 @@ +
+ @error('rate_limit')

{{ $message }}

@enderror + + @if ($confirmedUrl !== null) +

Thanks, we've received {{ $confirmedUrl }}

+ @else +
+ + + @error('url')

{{ $message }}

@enderror + +
+ @endif +
diff --git a/resources/views/urls/submit.blade.php b/resources/views/urls/submit.blade.php new file mode 100644 index 0000000..1385d93 --- /dev/null +++ b/resources/views/urls/submit.blade.php @@ -0,0 +1,7 @@ +@extends('layouts.app') + +@section('content') + + + +@endsection diff --git a/resources/views/welcome.blade.php b/resources/views/welcome.blade.php index 2c2e7c2..9c0de81 100644 --- a/resources/views/welcome.blade.php +++ b/resources/views/welcome.blade.php @@ -1,225 +1,5 @@ - - - - - +@extends('layouts.app') - {{ config('app.name', 'Laravel') }} - - - - - - - @if (file_exists(public_path('build/manifest.json')) || file_exists(public_path('hot'))) - @vite(['resources/css/app.css', 'resources/js/app.js']) - @else - - @endif - - -
- @if (Route::has('login')) - - @endif -
-
-
-
-

Let's get started

-

With so many options available to you,
we suggest you start with the following:

- - - -

- v{{ app()->version() }} - - View changelog - - - - -

-
-
- {{-- Laravel Logo --}} - - - - - - - - - - - {{-- 13 --}} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-
-
- - @if (Route::has('login')) - - @endif - - +@section('content') + Welcome +@endsection diff --git a/routes/console.php b/routes/console.php index 3c9adf1..cbb33b7 100644 --- a/routes/console.php +++ b/routes/console.php @@ -1,8 +1,8 @@ comment(Inspiring::quote()); -})->purpose('Display an inspiring quote'); +Schedule::command('fedi-discover:poll') + ->everyMinute() + ->withoutOverlapping(5) + ->runInBackground(); diff --git a/routes/web.php b/routes/web.php index 86a06c5..6b7b768 100644 --- a/routes/web.php +++ b/routes/web.php @@ -1,7 +1,16 @@ name('admin.instances'); diff --git a/shell.nix b/shell.nix index b07745c..c17534f 100644 --- a/shell.nix +++ b/shell.nix @@ -92,6 +92,10 @@ pkgs.mkShell { podman-compose -f $COMPOSE_FILE exec app php artisan "$@" } + dev-composer() { + podman-compose -f $COMPOSE_FILE exec app composer "$@" + } + # =================== # BUILD COMMANDS # =================== @@ -141,6 +145,7 @@ pkgs.mkShell { echo " dev-logs-redis Tail Redis logs" echo " dev-shell Shell into app container" echo " dev-artisan Run artisan command" + echo " dev-composer Run composer command" echo " base-build Build and push image" echo "" echo "Services:" diff --git a/tests/Feature/Actions/FetchPageActionTest.php b/tests/Feature/Actions/FetchPageActionTest.php new file mode 100644 index 0000000..826c755 --- /dev/null +++ b/tests/Feature/Actions/FetchPageActionTest.php @@ -0,0 +1,511 @@ + Http::response( + 'Hello', + 200, + ['Content-Type' => 'text/html'], + ), + ]); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertInstanceOf(FetchResult::class, $result); + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(200, $result->statusCode); + $this->assertNotNull($result->finalUrl); + } + + public function test_4xx_response_returns_blocked_4xx(): void + { + Http::fake([ + 'example.com/*' => Http::response('Not Found', 404), + ]); + + $result = $this->makeAction()('https://example.com/missing'); + + 
$this->assertInstanceOf(FetchResult::class, $result); + $this->assertSame(CrawlOutcomeEnum::Blocked4xx, $result->outcome); + $this->assertSame(404, $result->statusCode); + $this->assertIsString($result->errorMessage); + $this->assertStringContainsString('404', $result->errorMessage); + $this->assertNotNull($result->finalUrl); + } + + public function test_5xx_response_returns_blocked_5xx(): void + { + Http::fake([ + 'example.com/*' => Http::response('Service Unavailable', 503), + ]); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertInstanceOf(FetchResult::class, $result); + $this->assertSame(CrawlOutcomeEnum::Blocked5xx, $result->outcome); + $this->assertSame(503, $result->statusCode); + $this->assertIsString($result->errorMessage); + $this->assertStringContainsString('503', $result->errorMessage); + $this->assertNotNull($result->finalUrl); + } + + public function test_non_html_content_type_returns_rejected(): void + { + Http::fake([ + 'example.com/*' => Http::response( + 'PDF binary stuff', + 200, + ['Content-Type' => 'application/pdf'], + ), + ]); + + $result = $this->makeAction()('https://example.com/document.pdf'); + + $this->assertInstanceOf(FetchResult::class, $result); + $this->assertSame(CrawlOutcomeEnum::Rejected, $result->outcome); + $this->assertSame(200, $result->statusCode); + $this->assertIsString($result->errorMessage); + $this->assertStringContainsString('application/pdf', $result->errorMessage); + $this->assertNotNull($result->finalUrl); + $this->assertNull($result->title); + $this->assertNull($result->extractedText); + $this->assertEmpty($result->outboundLinks); + $this->assertNull($result->wordCount); + } + + public function test_text_html_with_charset_is_accepted(): void + { + Http::fake([ + 'example.com/*' => Http::response( + 'Hello charset world', + 200, + ['Content-Type' => 'text/html; charset=utf-8'], + ), + ]); + + $result = $this->makeAction()('https://example.com/page'); + + 
$this->assertInstanceOf(FetchResult::class, $result); + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(200, $result->statusCode); + } + + public function test_connection_failure_returns_failed(): void + { + Http::fake(function () { + throw new ConnectException( + 'Could not resolve host', + new Request('GET', 'https://example.com/page'), + null, + ['errno' => 6], + ); + }); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertInstanceOf(FetchResult::class, $result); + $this->assertSame(CrawlOutcomeEnum::Failed, $result->outcome); + $this->assertNull($result->statusCode); + $this->assertNull($result->finalUrl); + $this->assertIsString($result->errorMessage); + $this->assertNull($result->title); + $this->assertNull($result->extractedText); + $this->assertEmpty($result->outboundLinks); + $this->assertNull($result->wordCount); + } + + public function test_timeout_returns_timeout(): void + { + Http::fake(function () { + throw new ConnectException( + 'cURL error 28: Operation timed out', + new Request('GET', 'https://example.com/page'), + null, + ['errno' => 28], + ); + }); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertInstanceOf(FetchResult::class, $result); + $this->assertSame(CrawlOutcomeEnum::Timeout, $result->outcome); + $this->assertNull($result->statusCode); + $this->assertNull($result->finalUrl); + $this->assertIsString($result->errorMessage); + } + + public function test_success_extracts_title_from_html(): void + { + Http::fake([ + 'example.com/*' => Http::response( + 'My Page Title

Some content.

', + 200, + ['Content-Type' => 'text/html'], + ), + ]); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame('My Page Title', $result->title); + } + + public function test_success_extracts_main_text(): void + { + $html = <<<'HTML' + + + Article Title + + +
+

The Real Article

+

This is the main article body that should be extracted by readability.

+

Multiple paragraphs prove the extractor works on the full content.

+
+
Site footer noise
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertNotNull($result->extractedText); + $this->assertStringContainsString('main article body', $result->extractedText); + } + + public function test_success_extracts_and_filters_outbound_links(): void + { + $html = <<<'HTML' + + + Article With Links + + + + + + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertInstanceOf(Collection::class, $result->outboundLinks); + $this->assertSame(2, $result->outboundLinks->count()); + $this->assertContains('https://other.com/article', $result->outboundLinks->all()); + $this->assertContains('https://example.com/related-post', $result->outboundLinks->all()); + $this->assertNotContains('http://192.168.1.1/admin', $result->outboundLinks->all()); + $this->assertNotContains('https://user:pass@evil.com/', $result->outboundLinks->all()); + $this->assertNotContains('ftp://files.example.com/', $result->outboundLinks->all()); + } + + public function test_success_calculates_word_count(): void + { + $html = <<<'HTML' + + + Word Count Test + +
+

This article body has exactly nine words total here.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(9, $result->wordCount); + } + + public function test_uppercase_content_type_is_accepted_as_html(): void + { + Http::fake([ + 'example.com/*' => Http::response( + 'Uppercase CT

Content here.

', + 200, + ['Content-Type' => 'Text/HTML; charset=utf-8'], + ), + ]); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + } + + public function test_empty_href_is_filtered_from_outbound_links(): void + { + $html = <<<'HTML' + + + Empty Href Test + + + + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(0, $result->outboundLinks->count()); + } + + public function test_fragment_only_href_is_filtered_from_outbound_links(): void + { + $html = <<<'HTML' + + + Fragment Href Test + + + + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(0, $result->outboundLinks->count()); + } + + public function test_sufficient_text_triggers_language_detection_and_result_propagates(): void + { + // 24 words — above the detection threshold + $body = <<<'HTML' + + + Language Detection Test + +
+

The quick brown fox jumps over the lazy dog and then runs away into the forest + where many other animals live and play together every single day.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($body, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect') + ->once() + ->andReturn(['en', 0.95]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame('en', $result->language); + $this->assertSame(0.95, $result->languageConfidence); + } + + public function test_short_body_with_html_lang_attr_skips_service_and_uses_lang_attr(): void + { + // 7 words — below the detection threshold + $html = <<<'HTML' + + + Short Page + +
+

Too short to detect language automatically.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect')->never(); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame('pt-BR', $result->language); + $this->assertSame(1.0, $result->languageConfidence); + } + + public function test_short_body_with_no_lang_attr_returns_null_language(): void + { + // 7 words — below the detection threshold + $html = <<<'HTML' + + + Short Page + +
+

Too short to detect language automatically.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect')->never(); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertNull($result->language); + $this->assertNull($result->languageConfidence); + } + + public function test_whitespace_only_lang_attr_is_treated_as_absent(): void + { + // 7 words — below the detection threshold; lang attr is blank/whitespace-only + $html = <<<'HTML' + + + Short Page + +
+

Too short to detect language automatically.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect')->never(); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertNull($result->language); + $this->assertNull($result->languageConfidence); + } + + public function test_lang_attr_longer_than_35_chars_is_rejected(): void + { + // 7 words — below the detection threshold; lang attr exceeds BCP-47 column width (string(35)) + $html = <<<'HTML' + + + Short Page + +
+

Too short to detect language automatically.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect')->never(); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertNull($result->language); + $this->assertNull($result->languageConfidence); + } + + public function test_low_confidence_detection_falls_through_to_lang_attr(): void + { + // 24 words — above the detection threshold; service returns low-confidence result + $html = <<<'HTML' + + + Confidence Floor Test + +
+

The quick brown fox jumps over the lazy dog and then runs away into the forest + where many other animals live and play together every single day.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect') + ->once() + ->andReturn(['xx', 0.15]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame('en-US', $result->language); + $this->assertSame(1.0, $result->languageConfidence); + } + + private function makeAction(): FetchPageAction + { + return app(FetchPageAction::class); + } +} diff --git a/tests/Feature/Admin/InstancesAdminPageTest.php b/tests/Feature/Admin/InstancesAdminPageTest.php new file mode 100644 index 0000000..fb633d7 --- /dev/null +++ b/tests/Feature/Admin/InstancesAdminPageTest.php @@ -0,0 +1,133 @@ +get('/admin/instances'); + + $response->assertStatus(200); + } + + public function test_admin_instances_page_shows_each_instance_url_and_last_polled_at(): void + { + $mastodon = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create([ + 'url' => 'https://mastodon.social', + 'last_polled_at' => '2024-06-01 12:00:00', + ]); + + $lemmy = Instance::factory() + ->type(InstanceType::Lemmy) + ->enabled() + ->create([ + 'url' => 'https://lemmy.world', + 'last_polled_at' => '2024-06-01 13:00:00', + ]); + + $response = $this->get('/admin/instances'); + + $response->assertSee($mastodon->url); + $response->assertSee($lemmy->url); + $response->assertSee($mastodon->last_polled_at->toDateString()); + $response->assertSee($lemmy->last_polled_at->toDateString()); + } + + public function test_admin_instances_page_shows_error_count_per_instance(): void + { + $first = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(['url' => 'https://aardvark.example']); + + $second = Instance::factory() + ->type(InstanceType::Lemmy) + ->enabled() + ->create(['url' => 'https://zebra.example']); + + // First instance: 3 failed 
+ 2 non-failed pages + Page::factory() + ->count(3) + ->sequence(fn ($s) => ['url' => "https://aardvark.example/fail-{$s->index}"]) + ->createQuietly(['instance_id' => $first->id, 'status' => PageStatusEnum::Failed]); + + Page::factory() + ->count(2) + ->sequence(fn ($s) => ['url' => "https://aardvark.example/ok-{$s->index}"]) + ->createQuietly(['instance_id' => $first->id, 'status' => PageStatusEnum::Fetched]); + + // Second instance: 1 failed + 4 non-failed pages + Page::factory() + ->count(1) + ->sequence(fn ($s) => ['url' => "https://zebra.example/fail-{$s->index}"]) + ->createQuietly(['instance_id' => $second->id, 'status' => PageStatusEnum::Failed]); + + Page::factory() + ->count(4) + ->sequence(fn ($s) => ['url' => "https://zebra.example/ok-{$s->index}"]) + ->createQuietly(['instance_id' => $second->id, 'status' => PageStatusEnum::Fetched]); + + $response = $this->get('/admin/instances'); + + // Each error-count cell must render as "{n} errors" — this string cannot + // collide with dates, IDs, or the "URLs" column. The counts (3 and 1) + // are distinct and non-equal so the assertion proves per-row mapping, + // not a leaked total. 
+ $response->assertSeeInOrder([ + $first->url, + '3 errors', + $second->url, + '1 errors', + ]); + } + + public function test_admin_instances_page_shows_url_count_per_instance(): void + { + $first = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(['url' => 'https://aardvark.example']); + + $second = Instance::factory() + ->type(InstanceType::Lemmy) + ->enabled() + ->create(['url' => 'https://zebra.example']); + + Page::factory() + ->count(7) + ->sequence(fn ($s) => ['url' => "https://aardvark.example/page-{$s->index}"]) + ->createQuietly(['instance_id' => $first->id]); + + Page::factory() + ->count(2) + ->sequence(fn ($s) => ['url' => "https://zebra.example/page-{$s->index}"]) + ->createQuietly(['instance_id' => $second->id]); + + $response = $this->get('/admin/instances'); + + // Each count cell must render as "{n} URLs" — this string cannot + // collide with dates, IDs, or any other incidental numeric content, + // so the assertion only passes when a real count column is wired in. 
+ $response->assertSeeInOrder([ + $first->url, + '7 URLs', + $second->url, + '2 URLs', + ]); + } +} diff --git a/tests/Feature/BotPageTest.php b/tests/Feature/BotPageTest.php new file mode 100644 index 0000000..e544b00 --- /dev/null +++ b/tests/Feature/BotPageTest.php @@ -0,0 +1,39 @@ +get('/bot'); + + $response->assertStatus(200); + } + + public function test_bot_page_contains_user_agent_string(): void + { + $response = $this->get('/bot'); + + $response->assertSee('TroveBot/0.1 (+https://trove.lvl0.xyz/bot)', escape: false); + } + + public function test_bot_page_contains_robots_txt_opt_out_example(): void + { + $response = $this->get('/bot'); + + $response->assertSee('User-agent: TroveBot', escape: false); + $response->assertSee('Disallow: /', escape: false); + } + + public function test_bot_page_links_to_forge_repository(): void + { + $response = $this->get('/bot'); + + $response->assertSee('https://forge.lvl0.xyz/lvl0/trove', escape: false); + } +} diff --git a/tests/Feature/ExampleTest.php b/tests/Feature/ExampleTest.php deleted file mode 100644 index 8364a84..0000000 --- a/tests/Feature/ExampleTest.php +++ /dev/null @@ -1,19 +0,0 @@ -get('/'); - - $response->assertStatus(200); - } -} diff --git a/tests/Feature/Jobs/ProcessCrawlJobTest.php b/tests/Feature/Jobs/ProcessCrawlJobTest.php new file mode 100644 index 0000000..4f07f80 --- /dev/null +++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php @@ -0,0 +1,573 @@ +createQuietly(['url' => 'https://example.com/article']); + PageCrawl::factory()->page($page)->create(); + + Queue::assertPushed(ProcessCrawlJob::class); + } + + public function test_dispatched_job_carries_the_correct_page_crawl(): void + { + Queue::fake(); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->create(); + + Queue::assertPushed( + ProcessCrawlJob::class, + fn (ProcessCrawlJob $job) => $job->pageCrawl->id === $crawl->id, + ); + } + + public function 
test_handle_writes_outcome_to_page_crawl_on_success(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200, title: 'Hello', extractedText: 'hi', wordCount: 1); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $fresh = $crawl->fresh(); + $this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome); + $this->assertNotNull($fresh->completed_at); + $this->assertInstanceOf(Carbon::class, $fresh->completed_at); + $this->assertSame(200, $fresh->status_code); + $this->assertNull($fresh->error_message); + } + + public function test_handle_updates_page_to_fetched_on_success(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200, title: 'Hello', extractedText: 'hi', wordCount: 1); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $fresh = $page->fresh(); + $this->assertSame(PageStatusEnum::Fetched, $fresh->status); + $this->assertNotNull($fresh->fetched_at); + $this->assertInstanceOf(Carbon::class, $fresh->fetched_at); + $this->assertSame('Hello', $fresh->title); + } + + public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Rejected, statusCode: 200, errorMessage: 'Unsupported Content-Type: application/pdf'); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/brochure.pdf']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $fresh = $page->fresh(); + $this->assertSame(PageStatusEnum::Rejected, $fresh->status); + 
$this->assertNull($fresh->fetched_at); + } + + public function test_handle_updates_page_to_failed_on_blocked_4xx(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Blocked4xx, statusCode: 404, errorMessage: 'HTTP 404'); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/gone']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $fresh = $page->fresh(); + $this->assertSame(PageStatusEnum::Failed, $fresh->status); + $this->assertNotNull($fresh->failed_at); + $this->assertInstanceOf(Carbon::class, $fresh->failed_at); + } + + public function test_handle_updates_page_to_failed_on_timeout(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Timeout, errorMessage: 'Connection timed out after 10 seconds'); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/slow']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $fresh = $page->fresh(); + $this->assertSame(PageStatusEnum::Failed, $fresh->status); + $this->assertNotNull($fresh->failed_at); + $this->assertInstanceOf(Carbon::class, $fresh->failed_at); + } + + public function test_handle_schedules_retry_on_transient_failure(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused'); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + // A second PageCrawl row (the retry) must have been inserted for the same page + $this->assertSame(2, PageCrawl::where('page_id', $page->id)->count()); + + // The new row is pending — outcome IS NULL + $retryRow = PageCrawl::where('page_id', $page->id) + ->whereNull('outcome') + ->first(); + 
$this->assertNotNull($retryRow); + + // A delayed ProcessCrawlJob must have been pushed for the retry row + Queue::assertPushed( + ProcessCrawlJob::class, + fn (ProcessCrawlJob $job) => $job->pageCrawl->page_id === $page->id + && $job->pageCrawl->id === $retryRow->id, + ); + } + + public function test_handle_does_not_retry_after_three_attempts(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused'); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']); + + // 3 prior attempts already exist — this is the cap + PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly(); + PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly(); + $thirdCrawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl]) + ->handle(); + + // No 4th row must appear — retry cap reached + $this->assertSame(3, PageCrawl::where('page_id', $page->id)->count()); + + // No retry job dispatched + Queue::assertNotPushed(ProcessCrawlJob::class); + } + + public function test_handle_writes_failed_outcome_to_page_crawl(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'boom'); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $this->assertDatabaseHas('page_crawls', [ + 'id' => $crawl->id, + 'outcome' => CrawlOutcomeEnum::Failed->value, + 'status_code' => null, + 'error_message' => 'boom', + ]); + } + + public function test_handle_updates_page_to_failed_on_failed_outcome(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused'); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']); 
+ $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); + } + + public function test_handle_updates_page_to_failed_on_blocked_5xx(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Blocked5xx, statusCode: 503, errorMessage: 'HTTP 503'); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); + } + + public function test_handle_updates_page_to_failed_on_blocked_robots(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::BlockedRobots, errorMessage: 'Disallowed by robots.txt'); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); + } + + public function test_handle_does_not_register_outbound_links_on_failure(): void + { + Queue::fake(); + + $this->mockFetchPageAction( + CrawlOutcomeEnum::Failed, + outboundLinks: collect(['https://should-not-be-registered.com/page']), + errorMessage: 'Connection refused', + ); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/broken']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']); + $this->assertSame(1, Page::count()); + } + + public function test_handle_registers_outbound_links_on_success(): void + { + Queue::fake(); + + $this->mockFetchPageAction( + 
CrawlOutcomeEnum::Success, + statusCode: 200, + finalUrl: 'https://source.com/article', + title: 'Source Article', + extractedText: 'some text', + outboundLinks: collect(['https://other.com/article-1', 'https://another.com/post-2']), + wordCount: 2, + ); + + $page = Page::factory()->createQuietly(['url' => 'https://source.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']); + $this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']); + $this->assertSame(3, Page::count()); + } + + public function test_handle_releases_job_when_domain_is_locked(): void + { + Queue::fake(); + + // Pre-acquire the lock so the job sees it as already held + Cache::lock('crawler:domain:example.com', 10)->get(); + + // The fetcher must NOT be called — the job should bail before reaching it + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldNotReceive('__invoke'); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + $job = new ProcessCrawlJob($crawl); + $job->handle(); + + // No outcome written — handle() returned early + $this->assertNull($crawl->fresh()->outcome); + + // Page status unchanged from its factory default (Discovered) + $this->assertSame(PageStatusEnum::Discovered, $page->fresh()->status); + } + + public function test_handle_does_not_release_lock_after_completion(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + $job = new ProcessCrawlJob($crawl); + $job->handle(); + + // If handle() called $lock->release(), 
this second get() would succeed (true). + // It must fail (false) — the lock acquired inside handle() must still be held. + $result = Cache::lock('crawler:domain:example.com', 10)->get(); + $this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.'); + } + + public function test_handle_writes_blocked_robots_when_disallowed(): void + { + Queue::fake(); + + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: *\nDisallow: /", + 200, + ), + ]); + + // FetchPageAction must never be called — the robots gate returns before the lock + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldNotReceive('__invoke'); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + $domain = $crawl->domain; + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + // Outcome row must record BlockedRobots + $this->assertDatabaseHas('page_crawls', [ + 'id' => $crawl->id, + 'outcome' => CrawlOutcomeEnum::BlockedRobots->value, + ]); + + // Page status must be Failed (BlockedRobots::toPageStatus() === Failed) + $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); + + // The politeness lock must still be acquirable — the gate returned before ever claiming it + $this->assertTrue( + Cache::lock("crawler:domain:{$domain}", 10)->get(), + 'Expected the politeness lock to be free (gate returned before acquiring it), but it was already held.', + ); + } + + public function test_handle_acquires_domain_lock_before_fetching(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200); + + $page = Page::factory()->createQuietly(['url' => 'https://lock-test.example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + 
$domain = $crawl->domain; + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + // The lock must still be held after handle() completes — a second attempt to acquire it fails + $this->assertFalse( + Cache::lock("crawler:domain:{$domain}", 10)->get(), + 'Expected the domain lock to still be held after handle() ran, but it was free.', + ); + + // The fetch ran — outcome was written (proves the lock did not block execution) + $this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome); + } + + public function test_handle_proceeds_through_politeness_lock_when_robots_allow(): void + { + Queue::fake(); + + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: *\nAllow: /", + 200, + ), + ]); + + // FetchPageAction must be called exactly once — robots gate passed, fetch proceeds + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->once()->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Success, + statusCode: 200, + finalUrl: 'https://example.com/article', + title: 'Hello', + extractedText: 'hi', + outboundLinks: collect(), + wordCount: 1, + errorMessage: null, + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + $domain = $crawl->domain; + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + // Outcome must be Success — not BlockedRobots + $this->assertDatabaseHas('page_crawls', [ + 'id' => $crawl->id, + 'outcome' => CrawlOutcomeEnum::Success->value, + ]); + + // Page status must have advanced to Fetched + $this->assertSame(PageStatusEnum::Fetched, $page->fresh()->status); + + // Politeness lock must still be held (claimed during the fetch, never released) + $this->assertFalse( + Cache::lock("crawler:domain:{$domain}", 10)->get(), + 'Expected the politeness lock to be held after a 
successful fetch, but it was free.', + ); + } + + public function test_handle_persists_language_on_success(): void + { + Queue::fake(); + + $this->mockFetchPageAction( + CrawlOutcomeEnum::Success, + statusCode: 200, + title: 'Hello', + extractedText: 'hi', + wordCount: 1, + language: 'en', + languageConfidence: 0.95, + ); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $fresh = $page->fresh(); + $this->assertSame('en', $fresh->language); + $this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001); + } + + public function test_handle_does_not_overwrite_existing_language_when_new_fetch_returns_null(): void + { + Queue::fake(); + + $this->mockFetchPageAction( + CrawlOutcomeEnum::Success, + statusCode: 200, + title: 'Hello', + extractedText: 'hi', + wordCount: 1, + language: null, + languageConfidence: null, + ); + + // Page already has a language from a previous fetch + $page = Page::factory()->createQuietly([ + 'url' => 'https://example.com/article', + 'language' => 'en', + 'language_confidence' => 0.95, + ]); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $fresh = $page->fresh(); + + // Language columns must be sticky — null detection must NOT overwrite them + $this->assertSame('en', $fresh->language); + $this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001); + + // Other columns must still update — sticky applies to language only + $this->assertSame(PageStatusEnum::Fetched, $fresh->status); + $this->assertSame('Hello', $fresh->title); + } + + public function test_handle_leaves_language_null_when_no_prior_and_no_detection(): void + { + Queue::fake(); + + $this->mockFetchPageAction( + CrawlOutcomeEnum::Success, + statusCode: 200, + title: 'Hello', + extractedText: 'hi', + 
wordCount: 1, + language: null, + languageConfidence: null, + ); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $fresh = $page->fresh(); + $this->assertNull($fresh->language); + $this->assertNull($fresh->language_confidence); + } + + private function mockFetchPageAction( + CrawlOutcomeEnum $outcome, + ?int $statusCode = null, + ?string $finalUrl = 'https://example.com/article', + ?string $title = null, + ?string $extractedText = null, + ?Collection $outboundLinks = null, + ?int $wordCount = null, + ?string $errorMessage = null, + ?string $language = null, + ?float $languageConfidence = null, + ): void { + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: $outcome, + statusCode: $statusCode, + finalUrl: $finalUrl, + title: $title, + extractedText: $extractedText, + outboundLinks: $outboundLinks ?? 
collect(), + wordCount: $wordCount, + errorMessage: $errorMessage, + language: $language, + languageConfidence: $languageConfidence, + )); + $this->app->instance(FetchPageAction::class, $fetcher); + } +} diff --git a/tests/Feature/Listeners/PollFailedListenerTest.php b/tests/Feature/Listeners/PollFailedListenerTest.php new file mode 100644 index 0000000..abd706b --- /dev/null +++ b/tests/Feature/Listeners/PollFailedListenerTest.php @@ -0,0 +1,52 @@ +type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 0]); + + $message = 'connection timed out'; + $failedAt = CarbonImmutable::now(); + $event = new PollFailed($instance, $message, $failedAt); + + $service = Mockery::mock(PollAlertService::class); + $service->shouldReceive('recordFailure') + ->once() + ->with( + Mockery::on(fn (Instance $i) => $i->is($instance)), + $message, + ); + + $listener = new PollFailedListener($service); + $listener->handle($event); + } + + public function test_listener_is_not_queued(): void + { + $this->assertNotInstanceOf( + ShouldQueue::class, + new PollFailedListener($this->createStub(PollAlertService::class)), + ); + } +} diff --git a/tests/Feature/PageQueuePopulationTest.php b/tests/Feature/PageQueuePopulationTest.php new file mode 100644 index 0000000..943d79c --- /dev/null +++ b/tests/Feature/PageQueuePopulationTest.php @@ -0,0 +1,70 @@ +create(['url' => $url]); + + $expectedDomain = (new UrlService)->host($url); + + $this->assertDatabaseHas('page_crawls', [ + 'page_id' => $page->id, + 'domain' => $expectedDomain, + 'priority' => 0, + ]); + + $crawl = PageCrawl::where('page_id', $page->id)->first(); + $this->assertNotNull($crawl); + } + + public function test_first_or_create_with_existing_url_does_not_insert_duplicate_crawl(): void + { + $url = 'https://example-blog.com/article'; + + Page::factory()->create(['url' => $url]); + + // Finds the existing row — created event does not fire again + Page::firstOrCreate(['url' => $url], ['status' => 
'discovered']); + + $this->assertDatabaseCount('page_crawls', 1); + } + + public function test_updating_a_page_does_not_insert_another_crawl(): void + { + $page = Page::factory()->create(['url' => 'https://example-blog.com/article']); + + $page->update(['title' => 'New Title']); + + $this->assertDatabaseCount('page_crawls', 1); + } + + public function test_bad_url_throws_exception_page_persists_no_crawl_inserted(): void + { + $caught = null; + + try { + Page::create(['url' => 'not-a-url', 'status' => 'discovered']); + } catch (\InvalidArgumentException $e) { + $caught = $e; + } + + $this->assertNotNull($caught, 'Expected InvalidArgumentException to be thrown'); + $this->assertDatabaseHas('pages', ['url' => 'not-a-url']); + $this->assertDatabaseCount('page_crawls', 0); + } +} diff --git a/tests/Feature/PollFailedIntegrationTest.php b/tests/Feature/PollFailedIntegrationTest.php new file mode 100644 index 0000000..96dd973 --- /dev/null +++ b/tests/Feature/PollFailedIntegrationTest.php @@ -0,0 +1,37 @@ +type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 0]); + + $this->mock(PollFediverseAction::class) + ->shouldReceive('execute') + ->once() + ->andThrow(new RuntimeException('connection refused')); + + $this->artisan('fedi-discover:poll'); + + $this->assertSame(1, $instance->fresh()->consecutive_poll_failures); + } +} diff --git a/tests/Feature/Services/PollAlertServiceTest.php b/tests/Feature/Services/PollAlertServiceTest.php new file mode 100644 index 0000000..714f359 --- /dev/null +++ b/tests/Feature/Services/PollAlertServiceTest.php @@ -0,0 +1,171 @@ +type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 0]); + + $service = new PollAlertService; + $service->recordFailure($instance, 'test'); + + $this->assertDatabaseHas('fedi_discover_instances', [ + 'id' => $instance->id, + 'consecutive_poll_failures' => 1, + ]); + } + + public function test_no_alert_sent_below_threshold(): void + { + Http::fake(); + 
+ config([ + 'services.ntfy.url' => 'https://ntfy.example.com', + 'services.ntfy.topic' => 'trove-alerts', + 'services.ntfy.threshold' => 3, + ]); + + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 1]); // will become 2 after recordFailure + + $service = new PollAlertService; + $service->recordFailure($instance, 'test'); + + Http::assertNothingSent(); + } + + public function test_alert_sent_when_threshold_is_reached(): void + { + Http::fake(); + + config([ + 'services.ntfy.url' => 'https://ntfy.example.com', + 'services.ntfy.topic' => 'trove-alerts', + 'services.ntfy.threshold' => 3, + ]); + + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 2]); // will become 3 after recordFailure = exactly at threshold + + $service = new PollAlertService; + $service->recordFailure($instance, 'test'); + + Http::assertSent(function ($request) { + return $request->url() === 'https://ntfy.example.com/trove-alerts' + && $request->method() === 'POST'; + }); + } + + public function test_alert_sent_when_count_exceeds_threshold(): void + { + Http::fake(); + + config([ + 'services.ntfy.url' => 'https://ntfy.example.com', + 'services.ntfy.topic' => 'trove-alerts', + 'services.ntfy.threshold' => 3, + ]); + + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 3]); // will become 4 after recordFailure = above threshold + + $service = new PollAlertService; + $service->recordFailure($instance, 'test'); + + Http::assertSent(function ($request) { + return $request->url() === 'https://ntfy.example.com/trove-alerts' + && $request->method() === 'POST'; + }); + } + + public function test_no_alert_sent_when_threshold_is_zero(): void + { + Http::fake(); + + config([ + 'services.ntfy.url' => 'https://ntfy.example.com', + 'services.ntfy.topic' => 'trove-alerts', + 'services.ntfy.threshold' 
=> 0, + ]); + + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 5]); + + $service = new PollAlertService; + $service->recordFailure($instance, 'test'); + + Http::assertNothingSent(); + } + + public function test_no_alert_sent_when_topic_is_null(): void + { + Http::fake(); + + config([ + 'services.ntfy.url' => 'https://ntfy.example.com', + 'services.ntfy.topic' => null, + 'services.ntfy.threshold' => 3, + ]); + + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 2]); // will become 3 after recordFailure = at threshold + + $service = new PollAlertService; + $service->recordFailure($instance, 'test'); + + Http::assertNothingSent(); + } + + public function test_alert_body_contains_instance_url_and_message(): void + { + Http::fake(); + + config([ + 'services.ntfy.url' => 'https://ntfy.example.com', + 'services.ntfy.topic' => 'trove-alerts', + 'services.ntfy.threshold' => 3, + ]); + + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create([ + 'url' => 'https://mastodon.social', + 'consecutive_poll_failures' => 2, // will become 3 = at threshold + ]); + + $service = new PollAlertService; + $service->recordFailure($instance, 'connection refused after 3 retries'); + + Http::assertSent(function ($request) { + return str_contains($request->body(), 'https://mastodon.social') + && str_contains($request->body(), 'connection refused after 3 retries'); + }); + } +} diff --git a/tests/Feature/UrlDiscoveryTest.php b/tests/Feature/UrlDiscoveryTest.php new file mode 100644 index 0000000..c616132 --- /dev/null +++ b/tests/Feature/UrlDiscoveryTest.php @@ -0,0 +1,155 @@ +type(InstanceType::Mastodon) + ->enabled() + ->create(); + } + + private function makeEvent(Instance $instance, array $overrides = []): UrlDiscovered + { + return new UrlDiscovered( + url: $overrides['url'] ?? 
'https://example-blog.com/article', + instanceId: $overrides['instanceId'] ?? $instance->id, + discoveredAt: $overrides['discoveredAt'] ?? CarbonImmutable::parse('2026-04-26T12:00:00Z'), + postUrl: array_key_exists('postUrl', $overrides) ? $overrides['postUrl'] : 'https://mastodon.social/@alice/109876543210', + postBody: array_key_exists('postBody', $overrides) ? $overrides['postBody'] : 'check this out https://example-blog.com/article', + ); + } + + // --------------------------------------------------------------------------- + // Test 9 — happy path + // --------------------------------------------------------------------------- + + public function test_listener_creates_target_page_and_source_page_with_link(): void + { + $instance = $this->makeInstance(); + $discoveredAt = CarbonImmutable::parse('2026-04-26T12:00:00Z'); + + $event = new UrlDiscovered( + url: 'https://example-blog.com/article', + instanceId: $instance->id, + discoveredAt: $discoveredAt, + postUrl: 'https://mastodon.social/@alice/109876543210', + postBody: 'check this out https://example-blog.com/article', + ); + + event($event); + + // Target page + $targetPage = Page::where('url', 'https://example-blog.com/article')->first(); + $this->assertNotNull($targetPage); + + // Source page + $sourcePage = Page::where('url', 'https://mastodon.social/@alice/109876543210')->first(); + $this->assertNotNull($sourcePage); + + // Edge + $link = PageLink::where('source_page_id', $sourcePage->id) + ->where('target_page_id', $targetPage->id) + ->first(); + $this->assertNotNull($link); + } + + // --------------------------------------------------------------------------- + // Test 10 — idempotency + // --------------------------------------------------------------------------- + + public function test_listener_is_idempotent_on_repeated_event(): void + { + $instance = $this->makeInstance(); + $event = $this->makeEvent($instance); + + event($event); + event($event); + + $this->assertSame(2, Page::count()); + 
$this->assertSame(1, PageLink::count()); + } + + // --------------------------------------------------------------------------- + // Test 11 — null postUrl: only target page, no edge + // --------------------------------------------------------------------------- + + public function test_listener_with_null_post_url_creates_only_target_page(): void + { + $instance = $this->makeInstance(); + $event = $this->makeEvent($instance, ['postUrl' => null, 'postBody' => null]); + + event($event); + + $this->assertSame(1, Page::count()); + $this->assertSame(0, PageLink::count()); + + $targetPage = Page::where('url', 'https://example-blog.com/article')->first(); + $this->assertNotNull($targetPage); + } + + // --------------------------------------------------------------------------- + // Integration — UrlDiscovered event enqueues crawls for both pages via observer + // --------------------------------------------------------------------------- + + public function test_url_discovered_event_enqueues_crawls_via_observer(): void + { + $instance = $this->makeInstance(); + + $event = new UrlDiscovered( + url: 'https://example-blog.com/article', + instanceId: $instance->id, + discoveredAt: CarbonImmutable::parse('2026-04-26T12:00:00Z'), + postUrl: 'https://mastodon.social/@alice/109876543210', + postBody: 'check this out https://example-blog.com/article', + ); + + event($event); + + // Listener creates 2 pages (target + source); observer fires for each → 2 crawl rows + $this->assertDatabaseCount('page_crawls', 2); + $this->assertDatabaseHas('page_crawls', ['domain' => 'example-blog.com']); + $this->assertDatabaseHas('page_crawls', ['domain' => 'mastodon.social']); + } + + // --------------------------------------------------------------------------- + // Test 12 — listener is queued, not run inline + // --------------------------------------------------------------------------- + + public function test_listener_is_pushed_to_queue_not_run_inline(): void + { + Queue::fake(); + + 
$instance = $this->makeInstance(); + $event = $this->makeEvent($instance); + + event($event); + + Queue::assertPushed(CallQueuedListener::class, function (CallQueuedListener $job): bool { + return $job->class === UrlDiscoveredListener::class; + }); + } +} diff --git a/tests/Feature/UrlSubmissionTest.php b/tests/Feature/UrlSubmissionTest.php new file mode 100644 index 0000000..ff85245 --- /dev/null +++ b/tests/Feature/UrlSubmissionTest.php @@ -0,0 +1,158 @@ +get('/submit'); + + $response->assertStatus(200); + $response->assertSeeLivewire('url-submission-form'); + } + + // ------------------------------------------------------------------------- + // Test 2 — valid submission creates a page row as Discovered + // ------------------------------------------------------------------------- + + public function test_valid_url_submission_creates_page_as_discovered(): void + { + Livewire::test(UrlSubmissionForm::class) + ->set('url', 'https://example.com/interesting-post') + ->call('submit') + ->assertHasNoErrors(); + + $this->assertDatabaseHas('pages', [ + 'url' => 'https://example.com/interesting-post', + ]); + } + + // ------------------------------------------------------------------------- + // Test 3 — duplicate submission is idempotent (no second row created) + // ------------------------------------------------------------------------- + + public function test_duplicate_url_submission_does_not_create_second_page(): void + { + $url = 'https://example.com/seen-before'; + + Page::factory()->create([ + 'url' => $url, + 'status' => PageStatusEnum::Discovered, + ]); + + Livewire::test(UrlSubmissionForm::class) + ->set('url', $url) + ->call('submit') + ->assertHasNoErrors(); + + $this->assertDatabaseCount('pages', 1); + } + + // ------------------------------------------------------------------------- + // Test 4 — confirmation state echoes submitted URL + // ------------------------------------------------------------------------- + + public function 
test_confirmation_state_echoes_submitted_url(): void + { + $url = 'https://example.com/great-article'; + + Livewire::test(UrlSubmissionForm::class) + ->set('url', $url) + ->call('submit') + ->assertHasNoErrors() + ->assertSet('confirmedUrl', $url) + ->assertSet('url', '') + ->assertSee($url); + } + + // ------------------------------------------------------------------------- + // Test 5 — empty URL fails validation (regression lock) + // ------------------------------------------------------------------------- + + public function test_missing_url_fails_validation(): void + { + Livewire::test(UrlSubmissionForm::class) + ->set('url', '') + ->call('submit') + ->assertHasErrors(['url' => 'required']); + } + + // ------------------------------------------------------------------------- + // Test 6 — invalid URL formats fail validation + // ------------------------------------------------------------------------- + + #[DataProvider('invalidUrls')] + public function test_invalid_url_formats_fail_validation(string $url): void + { + Livewire::test(UrlSubmissionForm::class) + ->set('url', $url) + ->call('submit') + ->assertHasErrors('url'); + } + + public static function invalidUrls(): array + { + return [ + 'no scheme' => ['not-a-url'], + 'disallowed scheme' => ['ftp://example.com'], + 'javascript scheme' => ['javascript:alert(1)'], + ]; + } + + // ------------------------------------------------------------------------- + // Integration — form submission enqueues a crawl via PageObserver + // ------------------------------------------------------------------------- + + public function test_url_submission_form_enqueues_crawl_via_observer(): void + { + Livewire::test(UrlSubmissionForm::class) + ->set('url', 'https://example.com/article') + ->call('submit') + ->assertHasNoErrors(); + + $this->assertDatabaseCount('page_crawls', 1); + $this->assertDatabaseHas('page_crawls', ['domain' => 'example.com']); + } + + // 
------------------------------------------------------------------------- + // Test 7 — rate limit blocks the 11th submission within a minute + // ------------------------------------------------------------------------- + + public function test_rate_limit_blocks_eleventh_submission_within_a_minute(): void + { + // 10 submissions within the limit — each must succeed + for ($i = 1; $i <= 10; $i++) { + Livewire::test(UrlSubmissionForm::class) + ->set('url', "https://example.com/post-{$i}") + ->call('submit') + ->assertHasNoErrors(); + } + + // 11th submission from the same IP must be blocked, with the message visible + Livewire::test(UrlSubmissionForm::class) + ->set('url', 'https://example.com/post-11') + ->call('submit') + ->assertHasErrors('rate_limit') + ->assertSee('Too many submissions'); + + // The 11th URL must NOT have been persisted + $this->assertDatabaseCount('pages', 10); + } +} diff --git a/tests/Unit/Actions/RegisterDiscoveredPageActionTest.php b/tests/Unit/Actions/RegisterDiscoveredPageActionTest.php new file mode 100644 index 0000000..f993940 --- /dev/null +++ b/tests/Unit/Actions/RegisterDiscoveredPageActionTest.php @@ -0,0 +1,83 @@ +assertInstanceOf(Page::class, $page); + $this->assertSame('https://example.com/article', $page->url); + $this->assertSame(PageStatusEnum::Discovered, $page->status); + $this->assertNull($page->instance_id); + $this->assertDatabaseHas('pages', ['url' => 'https://example.com/article']); + } + + public function test_creates_page_with_provided_instance_id(): void + { + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(); + + $action = new RegisterDiscoveredPageAction; + + $page = $action('https://example.com/fediverse-post', instanceId: $instance->id); + + $this->assertInstanceOf(Page::class, $page); + $this->assertSame($instance->id, $page->instance_id); + $this->assertDatabaseHas('pages', [ + 'url' => 'https://example.com/fediverse-post', + 'instance_id' => $instance->id, + ]); + } 
+ + public function test_returns_existing_page_when_url_already_exists(): void + { + $existing = Page::factory()->createQuietly([ + 'url' => 'https://example.com/seen-before', + 'status' => PageStatusEnum::Discovered, + ]); + + $action = new RegisterDiscoveredPageAction; + + $returned = $action('https://example.com/seen-before'); + + $this->assertSame($existing->id, $returned->id); + $this->assertDatabaseCount('pages', 1); + } + + public function test_existing_page_status_not_overwritten_on_duplicate_call(): void + { + Page::factory()->createQuietly([ + 'url' => 'https://example.com/already-fetched', + 'status' => PageStatusEnum::Fetched, + ]); + + $action = new RegisterDiscoveredPageAction; + + $returned = $action('https://example.com/already-fetched'); + + $this->assertSame(PageStatusEnum::Fetched, $returned->status); + $this->assertDatabaseHas('pages', [ + 'url' => 'https://example.com/already-fetched', + 'status' => PageStatusEnum::Fetched, + ]); + } +} diff --git a/tests/Unit/Enums/CrawlOutcomeEnumTest.php b/tests/Unit/Enums/CrawlOutcomeEnumTest.php new file mode 100644 index 0000000..17b214d --- /dev/null +++ b/tests/Unit/Enums/CrawlOutcomeEnumTest.php @@ -0,0 +1,75 @@ + 'success', + 'Failed' => 'failed', + 'Timeout' => 'timeout', + 'BlockedRobots' => 'blocked_robots', + 'Blocked4xx' => 'blocked_4xx', + 'Blocked5xx' => 'blocked_5xx', + 'Rejected' => 'rejected', + ]; + + foreach ($expected as $caseName => $backingValue) { + $case = CrawlOutcomeEnum::from($backingValue); + + $this->assertSame($caseName, $case->name, "Case name for '{$backingValue}' should be '{$caseName}'"); + $this->assertSame($backingValue, $case->value, "Backing value for '{$caseName}' should be '{$backingValue}'"); + } + } + + public function test_enum_has_exactly_seven_cases(): void + { + $this->assertCount(7, CrawlOutcomeEnum::cases()); + } + + public function test_to_page_status_maps_each_outcome_correctly(): void + { + $this->assertSame(PageStatusEnum::Fetched, 
CrawlOutcomeEnum::Success->toPageStatus()); + $this->assertSame(PageStatusEnum::Rejected, CrawlOutcomeEnum::Rejected->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Failed->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Timeout->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Blocked4xx->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Blocked5xx->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::BlockedRobots->toPageStatus()); + } + + public function test_is_retryable_returns_true_only_for_transient_failures(): void + { + // Retryable: transient network/server problems that may resolve later + $this->assertTrue(CrawlOutcomeEnum::Failed->isRetryable()); + $this->assertTrue(CrawlOutcomeEnum::Timeout->isRetryable()); + $this->assertTrue(CrawlOutcomeEnum::Blocked5xx->isRetryable()); + + // Not retryable: success (done), permanent failures, or policy decisions + $this->assertFalse(CrawlOutcomeEnum::Success->isRetryable()); + $this->assertFalse(CrawlOutcomeEnum::Rejected->isRetryable()); + $this->assertFalse(CrawlOutcomeEnum::BlockedRobots->isRetryable()); + $this->assertFalse(CrawlOutcomeEnum::Blocked4xx->isRetryable()); + } + + public function test_should_register_outbound_links_returns_true_only_for_success(): void + { + $this->assertTrue(CrawlOutcomeEnum::Success->shouldRegisterOutboundLinks()); + + // No links to register on any non-Success outcome + $this->assertFalse(CrawlOutcomeEnum::Failed->shouldRegisterOutboundLinks()); + $this->assertFalse(CrawlOutcomeEnum::Timeout->shouldRegisterOutboundLinks()); + $this->assertFalse(CrawlOutcomeEnum::Rejected->shouldRegisterOutboundLinks()); + $this->assertFalse(CrawlOutcomeEnum::BlockedRobots->shouldRegisterOutboundLinks()); + $this->assertFalse(CrawlOutcomeEnum::Blocked4xx->shouldRegisterOutboundLinks()); + 
$this->assertFalse(CrawlOutcomeEnum::Blocked5xx->shouldRegisterOutboundLinks()); + } +} diff --git a/tests/Unit/Enums/PageStatusEnumTest.php b/tests/Unit/Enums/PageStatusEnumTest.php new file mode 100644 index 0000000..63d240d --- /dev/null +++ b/tests/Unit/Enums/PageStatusEnumTest.php @@ -0,0 +1,33 @@ + 'discovered', + 'Fetched' => 'fetched', + 'Failed' => 'failed', + 'Rejected' => 'rejected', + ]; + + foreach ($expected as $caseName => $backingValue) { + $case = PageStatusEnum::from($backingValue); + + $this->assertSame($caseName, $case->name, "Case name for '{$backingValue}' should be '{$caseName}'"); + $this->assertSame($backingValue, $case->value, "Backing value for '{$caseName}' should be '{$backingValue}'"); + } + } + + public function test_enum_has_exactly_four_cases(): void + { + $this->assertCount(4, PageStatusEnum::cases()); + } +} diff --git a/tests/Unit/ExampleTest.php b/tests/Unit/ExampleTest.php deleted file mode 100644 index 5773b0c..0000000 --- a/tests/Unit/ExampleTest.php +++ /dev/null @@ -1,16 +0,0 @@ -assertTrue(true); - } -} diff --git a/tests/Unit/Models/PageCrawlFactoryTest.php b/tests/Unit/Models/PageCrawlFactoryTest.php new file mode 100644 index 0000000..df9c02f --- /dev/null +++ b/tests/Unit/Models/PageCrawlFactoryTest.php @@ -0,0 +1,42 @@ +create(); + $crawl = PageCrawl::factory()->page($page)->successful()->create(); + + $this->assertSame(CrawlOutcomeEnum::Success, $crawl->outcome); + $this->assertInstanceOf(Carbon::class, $crawl->completed_at); + $this->assertNull($crawl->error_message); + } + + public function test_factory_failed_state_produces_failed_outcome_with_message(): void + { + Queue::fake(); + + $page = Page::factory()->create(); + $crawl = PageCrawl::factory()->page($page)->failed('Connection timed out')->create(); + + $this->assertSame(CrawlOutcomeEnum::Failed, $crawl->outcome); + $this->assertInstanceOf(Carbon::class, $crawl->completed_at); + $this->assertSame('Connection timed out', $crawl->error_message); + } +} diff 
--git a/tests/Unit/Models/PageCrawlTest.php b/tests/Unit/Models/PageCrawlTest.php new file mode 100644 index 0000000..08f8a59 --- /dev/null +++ b/tests/Unit/Models/PageCrawlTest.php @@ -0,0 +1,111 @@ +createQuietly(['url' => 'https://example.com/page-1']); + + $completedAt = Carbon::parse('2026-05-01 10:01:05'); + + $crawl = PageCrawl::create([ + 'page_id' => $page->id, + 'domain' => 'example.com', + 'priority' => 5, + 'completed_at' => $completedAt, + 'outcome' => CrawlOutcomeEnum::Success, + 'status_code' => 200, + 'error_message' => null, + ]); + + $fresh = $crawl->fresh(); + + $this->assertNotNull($fresh); + + // domain / priority round-trip + $this->assertSame('example.com', $fresh->domain); + $this->assertSame(5, $fresh->priority); + + // outcome is cast to the enum + $this->assertInstanceOf(CrawlOutcomeEnum::class, $fresh->outcome); + $this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome); + + // datetime casts + $this->assertInstanceOf(Carbon::class, $fresh->completed_at); + + $this->assertTrue($completedAt->equalTo($fresh->completed_at)); + + // nullable columns + $this->assertNull($fresh->error_message); + + // status_code persists + $this->assertSame(200, $fresh->status_code); + } + + public function test_page_crawl_belongs_to_a_page(): void + { + $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-2']); + + $crawl = PageCrawl::create([ + 'page_id' => $page->id, + 'domain' => 'example.com', + 'priority' => 1, + ]); + + $related = $crawl->page; + + $this->assertInstanceOf(Page::class, $related); + $this->assertSame($page->id, $related->id); + } + + public function test_deleting_a_page_cascades_to_its_page_crawls(): void + { + // createQuietly() skips the PageObserver so the count of explicit rows is predictable; + // this test is about cascade delete behaviour, not observer side effects. 
+ $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-cascade']); + + PageCrawl::factory()->page($page)->create(); + PageCrawl::factory()->page($page)->successful()->create(); + PageCrawl::factory()->page($page)->failed('timeout during fetch')->create(); + + $this->assertSame(3, PageCrawl::count()); + + $page->delete(); + + $this->assertSame(0, PageCrawl::count()); + } + + public function test_pending_crawls_are_filtered_by_null_outcome(): void + { + Queue::fake(); + + // createQuietly() skips the PageObserver; this test counts rows with null/non-null + // outcome — the auto-inserted observer crawl (outcome=null) would corrupt both counts. + $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-pending']); + + $pending = PageCrawl::factory()->page($page)->create(); + PageCrawl::factory()->page($page)->successful()->create(); + PageCrawl::factory()->page($page)->failed('connection refused')->create(); + + $this->assertSame(1, PageCrawl::whereNull('outcome')->count()); + $this->assertSame($pending->id, PageCrawl::whereNull('outcome')->first()->id); + + $this->assertSame(2, PageCrawl::whereNotNull('outcome')->count()); + } +} diff --git a/tests/Unit/Models/PageLinkTest.php b/tests/Unit/Models/PageLinkTest.php new file mode 100644 index 0000000..f7ffba2 --- /dev/null +++ b/tests/Unit/Models/PageLinkTest.php @@ -0,0 +1,52 @@ +create(['url' => 'https://source.example.com/post/1']); + $target = Page::factory()->create(['url' => 'https://target.example.com/page/2']); + + $link = PageLink::create([ + 'source_page_id' => $source->id, + 'target_page_id' => $target->id, + ]); + + $fresh = $link->fresh(); + + $this->assertNotNull($fresh); + $this->assertSame($source->id, $fresh->source_page_id); + $this->assertSame($target->id, $fresh->target_page_id); + + $this->assertInstanceOf(Page::class, $fresh->sourcePage); + $this->assertSame($source->id, $fresh->sourcePage->id); + + $this->assertInstanceOf(Page::class, $fresh->targetPage); + 
$this->assertSame($target->id, $fresh->targetPage->id); + } + + public function test_page_link_factory_with_source_and_target_methods_create_a_link(): void + { + $source = Page::factory()->create(['url' => 'https://source.example.com/post/1']); + $target = Page::factory()->create(['url' => 'https://target.example.com/page/2']); + + $link = PageLink::factory() + ->withSource($source) + ->withTarget($target) + ->create(); + + $this->assertSame($source->id, $link->source_page_id); + $this->assertSame($target->id, $link->target_page_id); + } +} diff --git a/tests/Unit/Models/PageTest.php b/tests/Unit/Models/PageTest.php new file mode 100644 index 0000000..95645ad --- /dev/null +++ b/tests/Unit/Models/PageTest.php @@ -0,0 +1,195 @@ + 'https://example.com/article', + 'status' => 'discovered', + 'title' => 'An Example Article', + 'instance_id' => null, + 'posted_at' => null, + 'fetched_at' => null, + ]); + + $fresh = $page->fresh(); + + $this->assertNotNull($fresh); + $this->assertSame('https://example.com/article', $fresh->url); + $this->assertSame('An Example Article', $fresh->title); + $this->assertNull($fresh->instance_id); + } + + public function test_page_instance_relationship_returns_the_owning_instance(): void + { + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(); + + $page = Page::create([ + 'url' => 'https://example.com/post/1', + 'status' => 'discovered', + 'instance_id' => $instance->id, + ]); + + $fresh = $page->fresh(); + + $this->assertInstanceOf(Instance::class, $fresh->instance); + $this->assertSame($instance->id, $fresh->instance->id); + } + + public function test_page_outgoing_and_incoming_links_relationships(): void + { + $source = Page::factory()->create(['url' => 'https://example.com/source']); + $target = Page::factory()->create(['url' => 'https://example.com/target']); + + PageLink::create([ + 'source_page_id' => $source->id, + 'target_page_id' => $target->id, + ]); + + $freshSource = $source->fresh(); + 
$freshTarget = $target->fresh(); + + $this->assertCount(1, $freshSource->outgoingLinks); + $this->assertCount(0, $freshSource->incomingLinks); + $this->assertCount(1, $freshTarget->incomingLinks); + $this->assertCount(0, $freshTarget->outgoingLinks); + + $this->assertSame($source->id, $freshTarget->incomingLinks->first()->source_page_id); + $this->assertSame($target->id, $freshSource->outgoingLinks->first()->target_page_id); + } + + public function test_page_language_is_fillable_and_persists(): void + { + $page = Page::create([ + 'url' => 'https://example.com/crawled', + 'status' => 'discovered', + 'language' => 'en', + ]); + + $fresh = $page->fresh(); + + $this->assertNotNull($fresh); + $this->assertSame('en', $fresh->language); + + $unset = Page::create([ + 'url' => 'https://example.com/no-language', + 'status' => 'discovered', + ]); + + $this->assertNull($unset->fresh()->language); + } + + public function test_page_has_many_crawls(): void + { + // createQuietly() skips the PageObserver so no auto-crawl row is inserted; + // this test is about HasMany scoping, not observer side effects. + $page = Page::factory()->createQuietly(); + $other = Page::factory()->createQuietly(); + + PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); + PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); + PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); + PageCrawl::create(['page_id' => $other->id, 'domain' => 'other.com']); + + $crawls = $page->fresh()->crawls; + + $this->assertCount(3, $crawls); + foreach ($crawls as $crawl) { + $this->assertInstanceOf(PageCrawl::class, $crawl); + $this->assertSame($page->id, $crawl->page_id); + } + } + + public function test_page_latest_crawl_returns_row_with_latest_created_at(): void + { + // createQuietly() skips the PageObserver; this test is about latestOfMany ordering, + // not observer side effects. 
Using create() would add an observer crawl whose + // created_at is now(), making the test fragile once the hardcoded sentinel date passes. + $page = Page::factory()->createQuietly(); + + $old = PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); + $old->created_at = Carbon::parse('2026-01-01 08:00:00'); + $old->save(); + + $middle = PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); + $middle->created_at = Carbon::parse('2026-03-15 12:00:00'); + $middle->save(); + + $newest = PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com', 'error_message' => 'sentinel-latest']); + $newest->created_at = Carbon::parse('2026-05-10 18:00:00'); + $newest->save(); + + $latest = $page->fresh()->latestCrawl; + + $this->assertInstanceOf(PageCrawl::class, $latest); + $this->assertSame('sentinel-latest', $latest->error_message); + } + + public function test_language_confidence_is_fillable_nullable_and_cast_to_float(): void + { + // Column must exist, be nullable (null round-trips cleanly), be mass-assignable, + // and the 'float' cast must be applied so we get a PHP float back, not a string. + $withConfidence = Page::factory()->createQuietly([ + 'language' => 'en', + 'language_confidence' => 0.857, + ]); + + $fresh = $withConfidence->fresh(); + + $this->assertNotNull($fresh); + $this->assertIsFloat($fresh->language_confidence); + $this->assertEqualsWithDelta(0.857, $fresh->language_confidence, 0.001); + + $withoutConfidence = Page::factory()->createQuietly(); + + $this->assertNull($withoutConfidence->fresh()->language_confidence); + } + + public function test_page_status_is_cast_to_enum(): void + { + $cases = [ + ['string' => 'discovered', 'enum' => PageStatusEnum::Discovered], + ['string' => 'fetched', 'enum' => PageStatusEnum::Fetched], + ['string' => 'failed', 'enum' => PageStatusEnum::Failed], + ]; + + foreach ($cases as ['string' => $raw, 'enum' => $expected]) { + $page = Page::create([ + 'url' => 'https://example.com/' . 
$raw, + 'status' => $raw, + ]); + + $fresh = $page->fresh(); + + $this->assertInstanceOf(PageStatusEnum::class, $fresh->status, "status '{$raw}' should cast to PageStatusEnum"); + $this->assertSame($expected, $fresh->status, "status '{$raw}' should equal PageStatusEnum::{$expected->name}"); + } + } +} diff --git a/tests/Unit/Services/LanguageDetectionServiceTest.php b/tests/Unit/Services/LanguageDetectionServiceTest.php new file mode 100644 index 0000000..62d4453 --- /dev/null +++ b/tests/Unit/Services/LanguageDetectionServiceTest.php @@ -0,0 +1,74 @@ +service = new LanguageDetectionService; + } + + public function test_detects_english_from_english_paragraph(): void + { + $text = 'The solar system is the gravitationally bound system of the Sun and the + objects that orbit it. Of the bodies that orbit the Sun directly, the largest + are the eight planets, with the remainder being smaller objects, the dwarf + planets and small solar system bodies. Planets and most other large bodies + in the solar system orbit the Sun in the same direction, counterclockwise + when viewed from above the Sun\'s north pole.'; + + $result = $this->service->detect($text); + + $this->assertIsArray($result); + $this->assertCount(2, $result); + $this->assertTrue( + str_starts_with($result[0], 'en'), + "Expected an English-family tag, got '{$result[0]}'.", + ); + $this->assertIsFloat($result[1]); + $this->assertGreaterThan(0.0, $result[1]); + $this->assertLessThanOrEqual(1.0, $result[1]); + } + + public function test_detects_portuguese_from_portuguese_paragraph(): void + { + $text = 'O sistema solar é o sistema gravitacionalmente ligado composto pelo Sol e + pelos objetos que orbitam ao seu redor. Dos corpos que orbitam o Sol + diretamente, os maiores são os oito planetas, sendo o restante composto por + objetos menores, como planetas anões e corpos menores do sistema solar. 
+ A Terra é o único planeta conhecido a abrigar vida, possuindo uma atmosfera + rica em nitrogênio e oxigênio que sustenta os seres vivos.'; + + $result = $this->service->detect($text); + + $this->assertIsArray($result); + $this->assertCount(2, $result); + $this->assertTrue( + str_starts_with($result[0], 'pt'), + "Expected a Portuguese-family tag, got '{$result[0]}'.", + ); + $this->assertIsFloat($result[1]); + $this->assertGreaterThan(0.0, $result[1]); + $this->assertLessThanOrEqual(1.0, $result[1]); + } + + public function test_returns_null_for_empty_string(): void + { + $this->assertNull($this->service->detect('')); + } + + public function test_returns_null_for_whitespace_only_string(): void + { + $this->assertNull($this->service->detect(' ')); + } +} diff --git a/tests/Unit/Services/PolitenessServiceTest.php b/tests/Unit/Services/PolitenessServiceTest.php new file mode 100644 index 0000000..ce93fee --- /dev/null +++ b/tests/Unit/Services/PolitenessServiceTest.php @@ -0,0 +1,56 @@ +assertSame(10, (new PolitenessService)->minDelayFor('example.com')); + } + + public function test_min_delay_for_respects_config_override(): void + { + config()->set('crawler.min_domain_delay_seconds', 30); + + $this->assertSame(30, (new PolitenessService)->minDelayFor('example.com')); + } + + public function test_min_delay_for_uses_robots_crawl_delay_when_higher(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + // Spatie does exact-token matching (lowercased), so the fixture UA + // must match the full string the service passes to crawlDelayFor(). 
+ "User-agent: TroveBot/0.1 (+https://trove.lvl0.xyz/bot)\nCrawl-delay: 30", + 200, + ), + ]); + + config()->set('crawler.min_domain_delay_seconds', 10); + config()->set('crawler.user_agent', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'); + + $this->assertSame(30, app(PolitenessService::class)->minDelayFor('example.com')); + } + + public function test_min_delay_for_uses_config_when_higher_than_robots(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: TroveBot/0.1 (+https://trove.lvl0.xyz/bot)\nCrawl-delay: 10", + 200, + ), + ]); + + config()->set('crawler.min_domain_delay_seconds', 60); + config()->set('crawler.user_agent', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'); + + $this->assertSame(60, app(PolitenessService::class)->minDelayFor('example.com')); + } +} diff --git a/tests/Unit/Services/RobotsServiceTest.php b/tests/Unit/Services/RobotsServiceTest.php new file mode 100644 index 0000000..746c173 --- /dev/null +++ b/tests/Unit/Services/RobotsServiceTest.php @@ -0,0 +1,96 @@ + Http::response( + "User-agent: *\nAllow: /", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $this->assertTrue($service->isAllowed('https://example.com/article', 'TroveBot/0.1')); + } + + public function test_is_allowed_returns_false_when_robots_txt_disallows_path(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: *\nDisallow: /", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $this->assertFalse($service->isAllowed('https://example.com/article', 'TroveBot/0.1')); + } + + public function test_is_allowed_returns_true_when_robots_txt_fetch_fails(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response('', 500), + ]); + + $service = app(RobotsService::class); + + $this->assertTrue($service->isAllowed('https://example.com/article', 'TroveBot/0.1')); + } + + public function test_is_allowed_caches_robots_txt_body_per_host(): void + { + Http::fake([ + 
'https://example.com/robots.txt' => Http::response( + "User-agent: *\nAllow: /", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $service->isAllowed('https://example.com/article', 'TroveBot/0.1'); + $service->isAllowed('https://example.com/another-article', 'TroveBot/0.1'); + + Http::assertSentCount(1); + } + + public function test_crawl_delay_for_returns_parsed_value(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: TroveBot/0.1\nCrawl-delay: 30", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $this->assertSame(30, $service->crawlDelayFor('example.com', 'TroveBot/0.1')); + } + + public function test_crawl_delay_for_returns_null_when_absent(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: *\nDisallow: /private", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $this->assertNull($service->crawlDelayFor('example.com', 'TroveBot/0.1')); + } +} diff --git a/tests/Unit/Services/UrlServiceTest.php b/tests/Unit/Services/UrlServiceTest.php new file mode 100644 index 0000000..97ea9a5 --- /dev/null +++ b/tests/Unit/Services/UrlServiceTest.php @@ -0,0 +1,111 @@ +service = new UrlService; + } + + // ------------------------------------------------------------------------- + // Happy path — simple URL + // ------------------------------------------------------------------------- + + public function test_extracts_host_from_simple_url(): void + { + $this->assertSame('example.com', $this->service->host('https://example.com')); + } + + // ------------------------------------------------------------------------- + // Path, query string, and fragment are ignored + // ------------------------------------------------------------------------- + + #[DataProvider('urlsWithNoise')] + public function test_extracts_host_ignoring_path_query_and_fragment(string $url, string $expectedHost): void + { + $this->assertSame($expectedHost, 
$this->service->host($url)); + } + + public static function urlsWithNoise(): array + { + return [ + 'path only' => ['https://example.com/some/path', 'example.com'], + 'path and query' => ['https://example.com/page?q=hello&lang=en', 'example.com'], + 'path, query, fragment' => ['https://example.com/page?q=1#section', 'example.com'], + 'http scheme with path' => ['http://news.ycombinator.com/item?id=42', 'news.ycombinator.com'], + ]; + } + + // ------------------------------------------------------------------------- + // Port number is stripped from the host + // ------------------------------------------------------------------------- + + public function test_strips_port_from_host(): void + { + $this->assertSame('example.com', $this->service->host('https://example.com:8080/path')); + } + + // ------------------------------------------------------------------------- + // Host is always returned as lowercase + // ------------------------------------------------------------------------- + + public function test_lowercases_host(): void + { + $this->assertSame('example.com', $this->service->host('https://EXAMPLE.COM/path')); + } + + // ------------------------------------------------------------------------- + // Throws on malformed, disallowed, or IP-literal input + // ------------------------------------------------------------------------- + + #[DataProvider('invalidInputs')] + public function test_throws_on_invalid_input(string $url): void + { + $this->expectException(\InvalidArgumentException::class); + + $this->service->host($url); + } + + public static function invalidInputs(): array + { + return [ + // malformed / missing structure + 'empty string' => [''], + 'no scheme' => ['example.com/path'], + 'scheme only' => ['https://'], + 'bare string' => ['not a url at all'], + + // disallowed schemes + 'javascript scheme' => ['javascript:alert(1)'], + 'ftp scheme' => ['ftp://example.com'], + 'data scheme' => ['data:text/html,

hi

'], + + // IP literals — not valid page-URL hosts for Trove's purposes + 'ipv4 literal' => ['https://192.168.1.1/path'], + 'ipv6 literal' => ['https://[::1]/path'], + 'ipv4 without path' => ['http://10.0.0.1'], + + // Embedded credentials (userinfo) — phishing/SSRF flag + 'embedded credentials' => ['https://user:pass@example.com/'], + 'username only' => ['https://user@example.com/'], + + // IPv6 with zone identifier — zone suffix defeats FILTER_VALIDATE_IP + 'ipv6 with zone' => ['https://[fe80::1%25eth0]/'], + + // IPv4-mapped IPv6 — FILTER_VALIDATE_IP recognises ::ffff:x.x.x.x as valid IPv6 + 'ipv4 mapped ipv6' => ['https://[::ffff:192.0.2.1]/path'], + ]; + } +} diff --git a/tests/Unit/ValueObjects/FetchResultTest.php b/tests/Unit/ValueObjects/FetchResultTest.php new file mode 100644 index 0000000..463dcb7 --- /dev/null +++ b/tests/Unit/ValueObjects/FetchResultTest.php @@ -0,0 +1,68 @@ +assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(200, $result->statusCode); + $this->assertSame('https://example.com/article', $result->finalUrl); + $this->assertSame('An Example Article', $result->title); + $this->assertSame('Lorem ipsum dolor sit amet.', $result->extractedText); + $this->assertInstanceOf(Collection::class, $result->outboundLinks); + $this->assertSame(['https://other.com', 'https://another.com'], $result->outboundLinks->all()); + $this->assertSame(5, $result->wordCount); + $this->assertNull($result->errorMessage); + $this->assertSame('en', $result->language); + $this->assertSame(0.95, $result->languageConfidence); + } + + public function test_it_accepts_null_for_failure_outcome_fields(): void + { + $result = new FetchResult( + outcome: CrawlOutcomeEnum::Failed, + statusCode: null, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'Could not connect', + language: null, + languageConfidence: null, + ); + + $this->assertSame(CrawlOutcomeEnum::Failed, $result->outcome); + 
$this->assertNull($result->statusCode); + $this->assertNull($result->finalUrl); + $this->assertNull($result->title); + $this->assertNull($result->extractedText); + $this->assertSame([], $result->outboundLinks->all()); + $this->assertNull($result->wordCount); + $this->assertSame('Could not connect', $result->errorMessage); + $this->assertNull($result->language); + $this->assertNull($result->languageConfidence); + } +}