From f0a8bdc1de95d3f7e78f21e30f6a7b908e431302 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Thu, 23 Apr 2026 19:55:57 +0200 Subject: [PATCH 01/65] 1 - Add production Dockerfile --- .dockerignore | 49 ++++++++++++++ .forgejo/workflows/build.yml | 5 +- docker/prod/Dockerfile | 128 +++++++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+), 3 deletions(-) create mode 100644 .dockerignore create mode 100644 docker/prod/Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..88f6a9a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,49 @@ +# Version control +.git +.gitignore +.gitattributes + +# Dev environment +shell.nix +Dockerfile.dev +docker/ + +# Tests (not needed in prod image) +tests/ +phpunit.xml +.phpunit.result.cache +phpstan.neon + +# Dependencies (rebuilt during image build) +node_modules/ +vendor/ + +# Build artifacts (frontend stage produces these) +public/build/ +public/hot + +# Editor / OS +.editorconfig +.idea/ +.vscode/ +.DS_Store +*.swp +*.swo + +# Env / secrets +.env +.env.* +!.env.example + +# Logs and runtime caches +storage/logs/*.log +storage/framework/cache/data/ +storage/framework/sessions/ +storage/framework/views/ + +# CI +.forgejo/ + +# Docs / project meta +README.md +LICENSE diff --git a/.forgejo/workflows/build.yml b/.forgejo/workflows/build.yml index 53d63b2..3676bb3 100644 --- a/.forgejo/workflows/build.yml +++ b/.forgejo/workflows/build.yml @@ -5,8 +5,7 @@ on: branches: [main] tags: ['v*'] paths: - - 'Dockerfile' - - 'docker/**' + - 'docker/prod/Dockerfile' - 'app/**' - 'bootstrap/**' - 'config/**' @@ -51,6 +50,6 @@ jobs: uses: https://data.forgejo.org/docker/build-push-action@v5 with: context: . 
- file: Dockerfile + file: docker/prod/Dockerfile push: true tags: ${{ steps.meta.outputs.tags }} diff --git a/docker/prod/Dockerfile b/docker/prod/Dockerfile new file mode 100644 index 0000000..46d8c2f --- /dev/null +++ b/docker/prod/Dockerfile @@ -0,0 +1,128 @@ +# syntax=docker/dockerfile:1 + +# ============================================================ +# Stage 1: Build frontend assets +# ============================================================ +FROM node:20-alpine AS frontend + +WORKDIR /app + +COPY package.json package-lock.json vite.config.js ./ +COPY resources/ resources/ + +RUN npm ci --no-audit --no-fund +RUN npm run build + +# ============================================================ +# Stage 2: Runtime (FrankenPHP) +# ============================================================ +FROM dunglas/frankenphp:1.1-php8.3-alpine AS runtime + +RUN apk add --no-cache \ + git \ + postgresql-client \ + curl + +RUN install-php-extensions \ + pdo_pgsql \ + redis \ + opcache \ + zip \ + gd \ + intl + +COPY --from=composer:2 /usr/bin/composer /usr/bin/composer + +WORKDIR /app + +ENV APP_ENV=production \ + APP_DEBUG=false \ + LOG_CHANNEL=stack \ + LOG_LEVEL=warning \ + DB_CONNECTION=pgsql \ + DB_HOST=db \ + DB_PORT=5432 \ + REDIS_HOST=redis \ + REDIS_PORT=6379 \ + CACHE_STORE=redis \ + QUEUE_CONNECTION=redis \ + SESSION_DRIVER=redis \ + BROADCAST_CONNECTION=log \ + MAIL_MAILER=log + +# Copy only the files composer needs before install, so the composer layer stays +# cached when application source changes. packages/ is required because composer.json +# declares it as a path repository. +COPY composer.json composer.lock ./ +COPY packages/ packages/ + +# Skip post-autoload scripts (package:discover) during build — they need a runtime +# Laravel boot which fails without proper env. Discovery happens at runtime via +# start-prod.sh. --classmap-authoritative implies --optimize-autoloader. 
+RUN composer install --no-dev --no-interaction --prefer-dist --classmap-authoritative --no-scripts + +COPY . . +COPY --from=frontend /app/public/build /app/public/build + +RUN chown -R www-data:www-data /app/storage /app/bootstrap/cache + +RUN cat > /etc/caddy/Caddyfile <<'EOF' +{ + frankenphp + order php_server before file_server +} + +:8000 { + root * /app/public + + php_server { + index index.php + } + + encode gzip zstd + + file_server + + header { + X-Frame-Options "SAMEORIGIN" + X-Content-Type-Options "nosniff" + Referrer-Policy "strict-origin-when-cross-origin" + } +} +EOF + +EXPOSE 8000 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD curl -fsS http://localhost:8000/up || exit 1 + +RUN cat > /start-prod.sh <<'EOF' +#!/bin/sh +set -e + +echo "Waiting for PostgreSQL at ${DB_HOST}:${DB_PORT}..." +for i in $(seq 1 60); do + if pg_isready -h "${DB_HOST}" -p "${DB_PORT}" -q; then + echo "PostgreSQL is ready." + break + fi + if [ "$i" = "60" ]; then + echo "Timed out waiting for PostgreSQL after 60s." >&2 + exit 1 + fi + sleep 1 +done + +php artisan package:discover --ansi +php artisan config:cache +php artisan route:cache +php artisan view:cache + +php artisan migrate --force + +exec frankenphp run --config /etc/caddy/Caddyfile +EOF + +RUN chmod +x /start-prod.sh + +CMD ["/start-prod.sh"] From 6c643373f24c928db1805ad0cee67ae2a0271da1 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Thu, 23 Apr 2026 19:59:42 +0200 Subject: [PATCH 02/65] 1 - Add README with deployment docs --- README.md | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4f73b2a..2d5a5d1 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,111 @@ -# trove +# Trove -A small web search engine. +A federated search engine for the small web. Seeded by fediverse attention, ranked by domain coherence rather than commercial authority. 
+ +## Tech stack + +Laravel 13 · Livewire 4 · PostgreSQL 17 (tsvector FTS) · Redis 7 · FrankenPHP · Vite 8 · Tailwind 4. + +## Local development + +Requires [Nix](https://nixos.org/download/) and [Podman](https://podman.io/). + +```sh +nix-shell # enter dev shell +dev-up # start app, db, redis +``` + +App: `http://localhost:8200` · Vite HMR: `http://localhost:5175` + +Other helpers inside the nix shell: `dev-down`, `dev-rebuild`, `dev-shell`, `dev-artisan `, `dev-logs`. + +## Self-hosting + +Trove ships as a Docker image published to `forge.lvl0.xyz/lvl0/trove`. You provide the compose/stack config. + +### Required environment + +| Variable | Purpose | +|---|---| +| `APP_KEY` | Laravel app key. Generate with `docker run --rm forge.lvl0.xyz/lvl0/trove:latest php artisan key:generate --show`. **Must persist across deployments** or sessions/encrypted data break. | +| `APP_URL` | Public URL, e.g. `https://trove.example.org` | +| `DB_DATABASE`, `DB_USERNAME`, `DB_PASSWORD` | PostgreSQL credentials | +| `DB_HOST` | Hostname of the PostgreSQL service. Default `db`. Override if your service is named differently. | +| `REDIS_HOST` | Hostname of the Redis service. Default `redis`. Override if your service is named differently. | + +### Services you need to provide + +- **App**: pull `forge.lvl0.xyz/lvl0/trove:latest` (or a pinned `v*` tag). Exposes port `8000` inside the container. The image runs migrations and warms caches on boot. +- **PostgreSQL 17**. Hostname must be reachable as `db` (default) or set `DB_HOST`. Persist `/var/lib/postgresql/data`. +- **Redis 7** with `--appendonly yes` (queue jobs persist across restarts). Hostname `redis` or set `REDIS_HOST`. + +On first boot the startup script waits for PostgreSQL, warms caches, then runs `php artisan migrate --force` automatically. The 60-second wait loop covers slow PG init; it exits with a clear error if PG never becomes reachable. 
+ +### Volumes + +- `/app/storage` — Laravel writable paths (logs, cached views, uploads). Persist this. + +### Healthcheck + +The image exposes `GET /up` (Laravel's built-in health route). The Dockerfile declares a HEALTHCHECK; your orchestrator can use `curl -fsS http://localhost:8000/up` for liveness. + +### Example compose stack + +A minimal reference — adapt for your infra. DockGE, Portainer, `docker compose`, Kubernetes, and bare `podman play kube` all work with equivalent configs. + +```yaml +services: + app: + image: forge.lvl0.xyz/lvl0/trove:latest + restart: always + ports: ["${APP_PORT:-8400}:8000"] + environment: + APP_KEY: "${APP_KEY}" + APP_URL: "${APP_URL}" + DB_DATABASE: "${DB_DATABASE}" + DB_USERNAME: "${DB_USERNAME}" + DB_PASSWORD: "${DB_PASSWORD}" + volumes: + - app_storage:/app/storage + depends_on: + db: { condition: service_healthy } + redis: { condition: service_healthy } + + db: + image: postgres:17-alpine + restart: always + environment: + POSTGRES_DB: "${DB_DATABASE}" + POSTGRES_USER: "${DB_USERNAME}" + POSTGRES_PASSWORD: "${DB_PASSWORD}" + volumes: + - db_data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"] + interval: 10s + retries: 5 + start_period: 10s + + redis: + image: redis:7-alpine + restart: always + command: redis-server --appendonly yes + volumes: + - redis_data:/data + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + retries: 5 + +volumes: + db_data: + redis_data: + app_storage: +``` + +### Upgrades + +Pull the new image tag, recreate the app container. Migrations run on boot (`php artisan migrate --force` in the startup script). Rollback by pointing at the previous `v*` tag. 
---- From 3706a81d3c38118e719139ac6c88f57472f3c19a Mon Sep 17 00:00:00 2001 From: myrmidex Date: Thu, 23 Apr 2026 20:48:35 +0200 Subject: [PATCH 03/65] 2 - Populate fedi-discover config with http settings and defaults --- .../Lvl0/FediDiscover/config/fedi-discover.php | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/packages/Lvl0/FediDiscover/config/fedi-discover.php b/packages/Lvl0/FediDiscover/config/fedi-discover.php index 355f9f3..3dff16a 100644 --- a/packages/Lvl0/FediDiscover/config/fedi-discover.php +++ b/packages/Lvl0/FediDiscover/config/fedi-discover.php @@ -3,5 +3,20 @@ declare(strict_types=1); return [ - // Instance list, polling intervals, and HTTP client config land here. + 'http' => [ + 'timeout' => 10, + // Default points at the project site so fediverse admins can always trace a Trove poller + // back to the project. Operators running their own deployment should override this via + // `php artisan vendor:publish --tag=fedi-discover-config` with their own contact URL. + 'user_agent' => 'Trove/1.0 (+https://trove.lvl0.xyz)', + 'max_redirects' => 3, + ], + + 'defaults' => [ + // Minimum recommended: 60. Mastodon/Lemmy rate limits apply per-instance. + 'interval_seconds' => 300, + ], + + // Instances are DB-managed (table: fedi_discover_instances). + // See the Instance model + admin UI (TBD). No instance list here. 
]; From 00e28c4868e748a70a5d5e195bdc25d0313fb7e7 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Thu, 23 Apr 2026 20:53:22 +0200 Subject: [PATCH 04/65] 2 - Add fedi_discover_instances migration --- ...7_create_fedi_discover_instances_table.php | 32 +++++++++++++++++++ .../src/FediDiscoverServiceProvider.php | 2 ++ 2 files changed, 34 insertions(+) create mode 100644 packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php diff --git a/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php b/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php new file mode 100644 index 0000000..eee4cc8 --- /dev/null +++ b/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php @@ -0,0 +1,32 @@ +id(); + $table->string('type'); + // Instance origin, e.g. https://mastodon.social. Not a full endpoint path. + $table->string('url'); + $table->boolean('enabled')->default(true); + $table->unsignedInteger('interval_seconds')->default(300); + $table->json('config')->default('{}'); + $table->timestampTz('last_polled_at')->nullable(); + $table->timestamps(); + + $table->unique(['type', 'url']); + }); + } + + public function down(): void + { + Schema::dropIfExists('fedi_discover_instances'); + } +}; diff --git a/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php b/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php index ced3940..8b069aa 100644 --- a/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php +++ b/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php @@ -15,6 +15,8 @@ public function register(): void public function boot(): void { + $this->loadMigrationsFrom(__DIR__.'/../database/migrations'); + if ($this->app->runningInConsole()) { $this->publishes([ __DIR__.'/../config/fedi-discover.php' => config_path('fedi-discover.php'), From 
7a2db5a14df20a787044cfed143be19f69d844b5 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Thu, 23 Apr 2026 23:02:50 +0200 Subject: [PATCH 05/65] chore - Set Pint concat_space to spaced + reformat --- bootstrap/app.php | 4 ++-- config/cache.php | 2 +- config/database.php | 2 +- config/filesystems.php | 2 +- config/logging.php | 2 +- config/session.php | 2 +- pint.json | 8 ++++++++ public/index.php | 6 +++--- 8 files changed, 18 insertions(+), 10 deletions(-) create mode 100644 pint.json diff --git a/bootstrap/app.php b/bootstrap/app.php index c183276..0be149f 100644 --- a/bootstrap/app.php +++ b/bootstrap/app.php @@ -6,8 +6,8 @@ return Application::configure(basePath: dirname(__DIR__)) ->withRouting( - web: __DIR__.'/../routes/web.php', - commands: __DIR__.'/../routes/console.php', + web: __DIR__ . '/../routes/web.php', + commands: __DIR__ . '/../routes/console.php', health: '/up', ) ->withMiddleware(function (Middleware $middleware): void { diff --git a/config/cache.php b/config/cache.php index c68acdf..e3584be 100644 --- a/config/cache.php +++ b/config/cache.php @@ -112,7 +112,7 @@ | */ - 'prefix' => env('CACHE_PREFIX', Str::slug((string) env('APP_NAME', 'laravel')).'-cache-'), + 'prefix' => env('CACHE_PREFIX', Str::slug((string) env('APP_NAME', 'laravel')) . '-cache-'), /* |-------------------------------------------------------------------------- diff --git a/config/database.php b/config/database.php index 64709ce..dcf030e 100644 --- a/config/database.php +++ b/config/database.php @@ -149,7 +149,7 @@ 'options' => [ 'cluster' => env('REDIS_CLUSTER', 'redis'), - 'prefix' => env('REDIS_PREFIX', Str::slug((string) env('APP_NAME', 'laravel')).'-database-'), + 'prefix' => env('REDIS_PREFIX', Str::slug((string) env('APP_NAME', 'laravel')) . 
'-database-'), 'persistent' => env('REDIS_PERSISTENT', false), ], diff --git a/config/filesystems.php b/config/filesystems.php index 37d8fca..aefceac 100644 --- a/config/filesystems.php +++ b/config/filesystems.php @@ -41,7 +41,7 @@ 'public' => [ 'driver' => 'local', 'root' => storage_path('app/public'), - 'url' => rtrim(env('APP_URL', 'http://localhost'), '/').'/storage', + 'url' => rtrim(env('APP_URL', 'http://localhost'), '/') . '/storage', 'visibility' => 'public', 'throw' => false, 'report' => false, diff --git a/config/logging.php b/config/logging.php index b09cb25..b0f50f7 100644 --- a/config/logging.php +++ b/config/logging.php @@ -89,7 +89,7 @@ 'handler_with' => [ 'host' => env('PAPERTRAIL_URL'), 'port' => env('PAPERTRAIL_PORT'), - 'connectionString' => 'tls://'.env('PAPERTRAIL_URL').':'.env('PAPERTRAIL_PORT'), + 'connectionString' => 'tls://' . env('PAPERTRAIL_URL') . ':' . env('PAPERTRAIL_PORT'), ], 'processors' => [PsrLogMessageProcessor::class], ], diff --git a/config/session.php b/config/session.php index f574482..c785fbc 100644 --- a/config/session.php +++ b/config/session.php @@ -129,7 +129,7 @@ 'cookie' => env( 'SESSION_COOKIE', - Str::slug((string) env('APP_NAME', 'laravel')).'-session' + Str::slug((string) env('APP_NAME', 'laravel')) . '-session' ), /* diff --git a/pint.json b/pint.json new file mode 100644 index 0000000..ae7601d --- /dev/null +++ b/pint.json @@ -0,0 +1,8 @@ +{ + "preset": "laravel", + "rules": { + "concat_space": { + "spacing": "one" + } + } +} diff --git a/public/index.php b/public/index.php index ee8f07e..86bfe78 100644 --- a/public/index.php +++ b/public/index.php @@ -6,15 +6,15 @@ define('LARAVEL_START', microtime(true)); // Determine if the application is in maintenance mode... -if (file_exists($maintenance = __DIR__.'/../storage/framework/maintenance.php')) { +if (file_exists($maintenance = __DIR__ . '/../storage/framework/maintenance.php')) { require $maintenance; } // Register the Composer autoloader... 
-require __DIR__.'/../vendor/autoload.php'; +require __DIR__ . '/../vendor/autoload.php'; // Bootstrap Laravel and handle the request... /** @var Application $app */ -$app = require_once __DIR__.'/../bootstrap/app.php'; +$app = require_once __DIR__ . '/../bootstrap/app.php'; $app->handleRequest(Request::capture()); From bdd2b0f2e542df2551f439d82cee5ba813effde0 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Thu, 23 Apr 2026 23:03:06 +0200 Subject: [PATCH 06/65] 2 - Add InstanceConfig value object and InstanceType enum --- .../src/Config/InstanceConfig.php | 65 ++++++++++ .../FediDiscover/src/Config/InstanceType.php | 10 ++ .../src/FediDiscoverServiceProvider.php | 6 +- .../tests/Unit/InstanceConfigTest.php | 121 ++++++++++++++++++ 4 files changed, 199 insertions(+), 3 deletions(-) create mode 100644 packages/Lvl0/FediDiscover/src/Config/InstanceConfig.php create mode 100644 packages/Lvl0/FediDiscover/src/Config/InstanceType.php create mode 100644 packages/Lvl0/FediDiscover/tests/Unit/InstanceConfigTest.php diff --git a/packages/Lvl0/FediDiscover/src/Config/InstanceConfig.php b/packages/Lvl0/FediDiscover/src/Config/InstanceConfig.php new file mode 100644 index 0000000..b4576a4 --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Config/InstanceConfig.php @@ -0,0 +1,65 @@ + $extras + */ + public function __construct( + public InstanceType $type, + public string $url, + public bool $enabled, + public int $intervalSeconds, + public array $extras + ) {} + + /** + * @throws InvalidArgumentException + */ + public static function fromArray(array $array): self + { + foreach (['type', 'url', 'enabled', 'interval_seconds'] as $key) { + if (! 
array_key_exists($key, $array)) { + throw new InvalidArgumentException("Missing required key: {$key}"); + } + } + + if ($array['interval_seconds'] <= 0) { + throw new InvalidArgumentException('Interval seconds needs to be larger than zero'); + } + + $type = InstanceType::tryFrom($array['type']); + if ($type === null) { + throw new InvalidArgumentException('Invalid type: ' . $array['type']); + } + + if (filter_var($array['url'], FILTER_VALIDATE_URL) === false) { + throw new InvalidArgumentException('Invalid URL: ' . $array['url']); + } + + return new self( + type: $type, + url: $array['url'], + enabled: $array['enabled'], + intervalSeconds: $array['interval_seconds'], + extras: $array['extras'] ?? [] + ); + } + + public function toArray(): array + { + return [ + 'type' => $this->type->value, + 'url' => $this->url, + 'enabled' => $this->enabled, + 'interval_seconds' => $this->intervalSeconds, + 'extras' => $this->extras, + ]; + } +} diff --git a/packages/Lvl0/FediDiscover/src/Config/InstanceType.php b/packages/Lvl0/FediDiscover/src/Config/InstanceType.php new file mode 100644 index 0000000..fe70e64 --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Config/InstanceType.php @@ -0,0 +1,10 @@ +mergeConfigFrom(__DIR__.'/../config/fedi-discover.php', 'fedi-discover'); + $this->mergeConfigFrom(__DIR__ . '/../config/fedi-discover.php', 'fedi-discover'); } public function boot(): void { - $this->loadMigrationsFrom(__DIR__.'/../database/migrations'); + $this->loadMigrationsFrom(__DIR__ . '/../database/migrations'); if ($this->app->runningInConsole()) { $this->publishes([ - __DIR__.'/../config/fedi-discover.php' => config_path('fedi-discover.php'), + __DIR__ . 
'/../config/fedi-discover.php' => config_path('fedi-discover.php'), ], 'fedi-discover-config'); } } diff --git a/packages/Lvl0/FediDiscover/tests/Unit/InstanceConfigTest.php b/packages/Lvl0/FediDiscover/tests/Unit/InstanceConfigTest.php new file mode 100644 index 0000000..03ffbf7 --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Unit/InstanceConfigTest.php @@ -0,0 +1,121 @@ + 'mastodon', + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => ['token' => 'abc123'], + ]); + + $this->assertSame(InstanceType::Mastodon, $config->type); + $this->assertSame('https://mastodon.social', $config->url); + $this->assertTrue($config->enabled); + $this->assertSame(600, $config->intervalSeconds); + $this->assertSame(['token' => 'abc123'], $config->extras); + } + + public function test_from_array_rejects_non_positive_interval_seconds(): void + { + $this->expectException(\InvalidArgumentException::class); + + InstanceConfig::fromArray([ + 'type' => 'mastodon', + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 0, + 'extras' => [], + ]); + } + + public function test_extras_defaults_to_empty_array_when_omitted(): void + { + $config = InstanceConfig::fromArray([ + 'type' => 'mastodon', + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + $this->assertSame([], $config->extras); + } + + #[DataProvider('requiredKeyProvider')] + public function test_from_array_throws_when_required_key_is_missing(string $missingKey): void + { + $input = [ + 'type' => 'mastodon', + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + ]; + + unset($input[$missingKey]); + + $this->expectException(\InvalidArgumentException::class); + $this->expectExceptionMessageMatches('/' . preg_quote($missingKey, '/') . 
'/'); + + InstanceConfig::fromArray($input); + } + + public static function requiredKeyProvider(): array + { + return [ + 'type missing' => ['type'], + 'url missing' => ['url'], + 'enabled missing' => ['enabled'], + 'interval_seconds missing' => ['interval_seconds'], + ]; + } + + public function test_from_array_throws_invalid_argument_exception_for_unknown_type_string(): void + { + $this->expectException(\InvalidArgumentException::class); + $this->expectExceptionMessageMatches('/pleroma/'); + + InstanceConfig::fromArray([ + 'type' => 'pleroma', + 'url' => 'https://pleroma.example.com', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + } + + public function test_from_array_rejects_malformed_url(): void + { + $this->expectException(\InvalidArgumentException::class); + + InstanceConfig::fromArray([ + 'type' => 'mastodon', + 'url' => 'not a url', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + } + + public function test_to_array_produces_array_that_round_trips_through_from_array(): void + { + $original = [ + 'type' => 'mastodon', + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => ['token' => 'abc123'], + ]; + + $this->assertSame($original, InstanceConfig::fromArray($original)->toArray()); + } +} From fc1c8ba0202020b86d3a4639bacb3b2408501db8 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Thu, 23 Apr 2026 23:26:53 +0200 Subject: [PATCH 07/65] 2 - Add Instance Eloquent model with factory --- ...7_create_fedi_discover_instances_table.php | 2 +- .../Database/Factories/InstanceFactory.php | 26 ++++++ .../Lvl0/FediDiscover/src/Models/Instance.php | 38 ++++++++ .../tests/Feature/InstanceModelTest.php | 88 +++++++++++++++++++ 4 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php create mode 100644 packages/Lvl0/FediDiscover/src/Models/Instance.php create mode 100644 packages/Lvl0/FediDiscover/tests/Feature/InstanceModelTest.php 
diff --git a/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php b/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php index eee4cc8..841949d 100644 --- a/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php +++ b/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php @@ -17,7 +17,7 @@ public function up(): void $table->string('url'); $table->boolean('enabled')->default(true); $table->unsignedInteger('interval_seconds')->default(300); - $table->json('config')->default('{}'); + $table->json('extras')->default('{}'); $table->timestampTz('last_polled_at')->nullable(); $table->timestamps(); diff --git a/packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php b/packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php new file mode 100644 index 0000000..f6dc60d --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php @@ -0,0 +1,26 @@ + + */ +class InstanceFactory extends Factory +{ + protected $model = Instance::class; + + /** + * @return array + */ + public function definition(): array + { + return [ + // + ]; + } +} diff --git a/packages/Lvl0/FediDiscover/src/Models/Instance.php b/packages/Lvl0/FediDiscover/src/Models/Instance.php new file mode 100644 index 0000000..a9be464 --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Models/Instance.php @@ -0,0 +1,38 @@ + */ + use HasFactory; + + protected $table = 'fedi_discover_instances'; + + protected $fillable = ['type', 'url', 'enabled', 'interval_seconds', 'extras', 'last_polled_at']; + + protected $casts = [ + 'type' => InstanceType::class, + 'enabled' => 'boolean', + 'extras' => 'array', + 'last_polled_at' => 'datetime', + ]; + + public function scopeEnabled($query) + { + return $query->where('enabled', true); + } + + protected static function 
newFactory(): Factory + { + return InstanceFactory::new(); + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Feature/InstanceModelTest.php b/packages/Lvl0/FediDiscover/tests/Feature/InstanceModelTest.php new file mode 100644 index 0000000..ae0634e --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Feature/InstanceModelTest.php @@ -0,0 +1,88 @@ + InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => ['token' => 'abc123'], + ]); + + $instance = Instance::first(); + + $this->assertNotNull($instance); + $this->assertSame(InstanceType::Mastodon, $instance->type); + $this->assertSame('https://mastodon.social', $instance->url); + $this->assertTrue($instance->enabled); + $this->assertSame(600, $instance->interval_seconds); + $this->assertSame(['token' => 'abc123'], $instance->extras); + } + + public function test_enabled_is_fillable_and_cast_to_boolean(): void + { + $instance = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => false, + 'interval_seconds' => 600, + ]); + + $this->assertFalse($instance->fresh()->enabled); + } + + public function test_last_polled_at_is_fillable_and_cast_to_datetime(): void + { + $polledAt = Carbon::parse('2026-04-23 12:00:00'); + + $instance = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'last_polled_at' => $polledAt, + ]); + + $fresh = $instance->fresh(); + + $this->assertInstanceOf(Carbon::class, $fresh->last_polled_at); + $this->assertTrue($fresh->last_polled_at->equalTo($polledAt)); + } + + public function test_enabled_scope_returns_only_enabled_instances(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://enabled.example', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 
'https://disabled.example', + 'enabled' => false, + 'interval_seconds' => 600, + ]); + + $enabled = Instance::enabled()->get(); + + $this->assertCount(1, $enabled); + $this->assertSame('https://enabled.example', $enabled->first()->url); + } +} From 52d6b493cbfe53c924c94bcd0f0cb2a6055ea0ac Mon Sep 17 00:00:00 2001 From: myrmidex Date: Fri, 24 Apr 2026 19:55:43 +0200 Subject: [PATCH 08/65] 2 - Add fedi-discover:validate console command --- .../Commands/ValidateInstancesCommand.php | 64 +++++ .../Database/Factories/InstanceFactory.php | 29 ++- .../src/FediDiscoverServiceProvider.php | 5 + .../Lvl0/FediDiscover/src/Models/Instance.php | 15 +- .../Feature/InstanceConfigPersistenceTest.php | 57 +++++ .../Feature/ValidateInstancesCommandTest.php | 221 ++++++++++++++++++ 6 files changed, 389 insertions(+), 2 deletions(-) create mode 100644 packages/Lvl0/FediDiscover/src/Console/Commands/ValidateInstancesCommand.php create mode 100644 packages/Lvl0/FediDiscover/tests/Feature/InstanceConfigPersistenceTest.php create mode 100644 packages/Lvl0/FediDiscover/tests/Feature/ValidateInstancesCommandTest.php diff --git a/packages/Lvl0/FediDiscover/src/Console/Commands/ValidateInstancesCommand.php b/packages/Lvl0/FediDiscover/src/Console/Commands/ValidateInstancesCommand.php new file mode 100644 index 0000000..99cbcd1 --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Console/Commands/ValidateInstancesCommand.php @@ -0,0 +1,64 @@ +option('enabled-only')) { + $instances->enabled(); + } + + $instances = $instances->get(); + + $invalidInstances = collect(); + + $instances->each(function (Instance $instance) use ($invalidInstances) { + $reasons = collect(); + + if (filter_var($instance->url, FILTER_VALIDATE_URL) === false) { + $reasons->add('Invalid URL: ' . $instance->url); + } + + if ($instance->interval_seconds < 1) { + $reasons->add('Invalid interval seconds: ' . 
$instance->interval_seconds); + } + + if ($reasons->isNotEmpty()) { + $invalidInstances->add([ + 'instance' => $instance, + 'reasons' => $reasons, + ]); + } + }); + + $this->info((string) $instances->count()); + $this->info(($instances->count() - $invalidInstances->count()) . ' valid'); + $this->line($invalidInstances->count() . ' invalid'); + + if ($invalidInstances->isNotEmpty()) { + $invalidInstances->each(function (array $instanceArray) { + $instance = $instanceArray['instance']; + $reason = $instanceArray['reasons']->join(', '); + $this->warn($instance->id . ' - ' . $instance->url); + $this->line(' : ' . $reason); + }); + + return self::FAILURE; + } + + return self::SUCCESS; + } +} diff --git a/packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php b/packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php index f6dc60d..b8df1f8 100644 --- a/packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php +++ b/packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php @@ -5,6 +5,7 @@ namespace Lvl0\FediDiscover\Database\Factories; use Illuminate\Database\Eloquent\Factories\Factory; +use Lvl0\FediDiscover\Config\InstanceType; use Lvl0\FediDiscover\Models\Instance; /** @@ -20,7 +21,33 @@ class InstanceFactory extends Factory public function definition(): array { return [ - // + 'type' => null, + 'url' => fake()->url, + 'enabled' => null, + 'interval_seconds' => 600, + 'extras' => [], + 'last_polled_at' => now(), ]; } + + public function type(InstanceType $type): self + { + return $this->state(fn () => [ + 'type' => $type->value, + ]); + } + + public function enabled(): self + { + return $this->state(fn () => [ + 'enabled' => true, + ]); + } + + public function disabled(): self + { + return $this->state(fn () => [ + 'enabled' => false, + ]); + } } diff --git a/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php b/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php index 289b2fd..8eeb73e 100644 --- 
a/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php +++ b/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php @@ -5,6 +5,7 @@ namespace Lvl0\FediDiscover; use Illuminate\Support\ServiceProvider; +use Lvl0\FediDiscover\Console\Commands\ValidateInstancesCommand; class FediDiscoverServiceProvider extends ServiceProvider { @@ -21,6 +22,10 @@ public function boot(): void $this->publishes([ __DIR__ . '/../config/fedi-discover.php' => config_path('fedi-discover.php'), ], 'fedi-discover-config'); + + $this->commands([ + ValidateInstancesCommand::class, + ]); } } } diff --git a/packages/Lvl0/FediDiscover/src/Models/Instance.php b/packages/Lvl0/FediDiscover/src/Models/Instance.php index a9be464..866ef17 100644 --- a/packages/Lvl0/FediDiscover/src/Models/Instance.php +++ b/packages/Lvl0/FediDiscover/src/Models/Instance.php @@ -4,12 +4,25 @@ namespace Lvl0\FediDiscover\Models; +use Illuminate\Database\Eloquent\Builder; use Illuminate\Database\Eloquent\Factories\Factory; use Illuminate\Database\Eloquent\Factories\HasFactory; use Illuminate\Database\Eloquent\Model; +use Illuminate\Support\Carbon; use Lvl0\FediDiscover\Config\InstanceType; use Lvl0\FediDiscover\Database\Factories\InstanceFactory; +/** + * @property int $id + * @property InstanceType $type + * @property string $url + * @property bool $enabled + * @property int $interval_seconds + * @property array $extras + * @property Carbon|null $last_polled_at + * @property Carbon $created_at + * @property Carbon $updated_at + */ class Instance extends Model { /** @use HasFactory */ @@ -26,7 +39,7 @@ class Instance extends Model 'last_polled_at' => 'datetime', ]; - public function scopeEnabled($query) + public function scopeEnabled($query): Builder { return $query->where('enabled', true); } diff --git a/packages/Lvl0/FediDiscover/tests/Feature/InstanceConfigPersistenceTest.php b/packages/Lvl0/FediDiscover/tests/Feature/InstanceConfigPersistenceTest.php new file mode 100644 index 0000000..7f8bf43 --- 
/dev/null +++ b/packages/Lvl0/FediDiscover/tests/Feature/InstanceConfigPersistenceTest.php @@ -0,0 +1,57 @@ + InstanceType::Mastodon->value, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => ['token' => 'abc123'], + ]); + + Instance::create($config->toArray()); + + $this->artisan('fedi-discover:validate') + ->assertExitCode(0); + } + + public function test_an_instance_config_survives_a_write_read_cycle_through_the_model(): void + { + $original = InstanceConfig::fromArray([ + 'type' => InstanceType::Mastodon->value, + 'url' => 'https://hachyderm.io', + 'enabled' => false, + 'interval_seconds' => 900, + 'extras' => ['foo' => 'bar'], + ]); + + Instance::create($original->toArray()); + + $instance = Instance::query()->firstOrFail(); + + $roundTripped = InstanceConfig::fromArray([ + 'type' => $instance->type->value, + 'url' => $instance->url, + 'enabled' => $instance->enabled, + 'interval_seconds' => $instance->interval_seconds, + 'extras' => $instance->extras, + ]); + + $this->assertEquals($original, $roundTripped); + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Feature/ValidateInstancesCommandTest.php b/packages/Lvl0/FediDiscover/tests/Feature/ValidateInstancesCommandTest.php new file mode 100644 index 0000000..878d690 --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Feature/ValidateInstancesCommandTest.php @@ -0,0 +1,221 @@ +artisan('fedi-discover:validate') + ->assertExitCode(0); + } + + public function test_it_exits_zero_when_all_instances_are_valid(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate') + ->assertExitCode(0); + } + + public function test_it_exits_nonzero_when_a_row_has_an_invalid_url(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'not-a-url', + 'enabled' => true, + 'interval_seconds' => 
600, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate') + ->assertExitCode(1); + } + + public function test_it_exits_nonzero_when_a_row_has_a_zero_interval(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 0, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate') + ->assertExitCode(1); + } + + public function test_it_reports_summary_of_valid_and_invalid_counts(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://hachyderm.io', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'bogus', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate') + ->expectsOutputToContain('3') + ->expectsOutputToContain('2 valid') + ->expectsOutputToContain('1 invalid') + ->assertExitCode(1); + } + + public function test_it_does_not_fail_fast_and_reports_every_invalid_row(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'bogus-one', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + $second = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 0, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate') + ->expectsOutputToContain('bogus-one') + ->expectsOutputToContain((string) $second->id) + ->assertExitCode(1); + } + + public function test_it_includes_the_validation_error_message_for_each_invalid_row(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'not-a-url', + 'enabled' => true, + 'interval_seconds' => 600, + 
'extras' => [], + ]); + + $this->artisan('fedi-discover:validate') + ->expectsOutputToContain('Invalid URL: not-a-url') + ->assertExitCode(1); + } + + public function test_summary_counts_are_accurate_when_mixed(): void + { + // 2 valid + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://hachyderm.io', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + // 3 invalid (different defects) + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'bogus-one', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://fosstodon.org', + 'enabled' => true, + 'interval_seconds' => 0, + 'extras' => [], + ]); + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'also-bad', + 'enabled' => true, + 'interval_seconds' => -5, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate') + ->expectsOutputToContain('5') + ->expectsOutputToContain('2 valid') + ->expectsOutputToContain('3 invalid') + ->assertExitCode(1); + } + + public function test_it_exits_zero_with_enabled_only_when_no_enabled_instances_exist(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => false, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate', ['--enabled-only' => true]) + ->assertExitCode(0); + } + + public function test_it_exits_zero_with_an_enabled_only_flag_when_disabled_rows_are_invalid(): void + { + // A disabled row that would fail InstanceConfig validation + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'broken-and-disabled', + 'enabled' => false, + 'interval_seconds' => 0, + 'extras' => [], + ]); + + // A valid enabled row + 
Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'extras' => [], + ]); + + $this->artisan('fedi-discover:validate', ['--enabled-only' => true]) + ->assertExitCode(0); + } +} From 3eff919945d343ff3b8d6e9468b317d55cdc1eb5 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Fri, 24 Apr 2026 21:55:57 +0200 Subject: [PATCH 09/65] 3 - Add UrlDiscovered event --- database/factories/PageFactory.php | 24 +++++++++++++ ...7_create_fedi_discover_instances_table.php | 1 + .../Database/Factories/InstanceFactory.php | 1 + .../FediDiscover/src/Events/UrlDiscovered.php | 26 ++++++++++++++ .../src/FediDiscoverServiceProvider.php | 6 ++++ .../Lvl0/FediDiscover/src/Models/Instance.php | 3 +- .../Feature/InstanceConfigPersistenceTest.php | 2 +- .../tests/Feature/InstanceModelTest.php | 25 +++++++++++++ .../tests/Unit/UrlDiscoveredTest.php | 35 +++++++++++++++++++ 9 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 database/factories/PageFactory.php create mode 100644 packages/Lvl0/FediDiscover/src/Events/UrlDiscovered.php create mode 100644 packages/Lvl0/FediDiscover/tests/Unit/UrlDiscoveredTest.php diff --git a/database/factories/PageFactory.php b/database/factories/PageFactory.php new file mode 100644 index 0000000..52302a8 --- /dev/null +++ b/database/factories/PageFactory.php @@ -0,0 +1,24 @@ + + */ +class PageFactory extends Factory +{ + /** + * Define the model's default state. 
+ * + * @return array + */ + public function definition(): array + { + return [ + // + ]; + } +} diff --git a/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php b/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php index 841949d..5a9fb60 100644 --- a/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php +++ b/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php @@ -19,6 +19,7 @@ public function up(): void $table->unsignedInteger('interval_seconds')->default(300); $table->json('extras')->default('{}'); $table->timestampTz('last_polled_at')->nullable(); + $table->string('last_seen_id')->nullable(); $table->timestamps(); $table->unique(['type', 'url']); diff --git a/packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php b/packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php index b8df1f8..1b7e74d 100644 --- a/packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php +++ b/packages/Lvl0/FediDiscover/src/Database/Factories/InstanceFactory.php @@ -26,6 +26,7 @@ public function definition(): array 'enabled' => null, 'interval_seconds' => 600, 'extras' => [], + 'last_seen_id' => null, 'last_polled_at' => now(), ]; } diff --git a/packages/Lvl0/FediDiscover/src/Events/UrlDiscovered.php b/packages/Lvl0/FediDiscover/src/Events/UrlDiscovered.php new file mode 100644 index 0000000..b63eb1e --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Events/UrlDiscovered.php @@ -0,0 +1,26 @@ +loadMigrationsFrom(__DIR__ . '/../database/migrations'); + Event::listen( + UrlDiscovered::class, + ); + if ($this->app->runningInConsole()) { $this->publishes([ __DIR__ . 
'/../config/fedi-discover.php' => config_path('fedi-discover.php'), diff --git a/packages/Lvl0/FediDiscover/src/Models/Instance.php b/packages/Lvl0/FediDiscover/src/Models/Instance.php index 866ef17..da751f9 100644 --- a/packages/Lvl0/FediDiscover/src/Models/Instance.php +++ b/packages/Lvl0/FediDiscover/src/Models/Instance.php @@ -19,6 +19,7 @@ * @property bool $enabled * @property int $interval_seconds * @property array $extras + * @property int $last_seen_id * @property Carbon|null $last_polled_at * @property Carbon $created_at * @property Carbon $updated_at @@ -30,7 +31,7 @@ class Instance extends Model protected $table = 'fedi_discover_instances'; - protected $fillable = ['type', 'url', 'enabled', 'interval_seconds', 'extras', 'last_polled_at']; + protected $fillable = ['type', 'url', 'enabled', 'interval_seconds', 'extras', 'last_seen_id', 'last_polled_at']; protected $casts = [ 'type' => InstanceType::class, diff --git a/packages/Lvl0/FediDiscover/tests/Feature/InstanceConfigPersistenceTest.php b/packages/Lvl0/FediDiscover/tests/Feature/InstanceConfigPersistenceTest.php index 7f8bf43..ee35919 100644 --- a/packages/Lvl0/FediDiscover/tests/Feature/InstanceConfigPersistenceTest.php +++ b/packages/Lvl0/FediDiscover/tests/Feature/InstanceConfigPersistenceTest.php @@ -14,7 +14,7 @@ class InstanceConfigPersistenceTest extends TestCase { use RefreshDatabase; - public function test_instance_config_toArray_is_mass_assignable_on_the_model(): void + public function test_instance_config_to_array_is_mass_assignable_on_the_model(): void { $config = InstanceConfig::fromArray([ 'type' => InstanceType::Mastodon->value, diff --git a/packages/Lvl0/FediDiscover/tests/Feature/InstanceModelTest.php b/packages/Lvl0/FediDiscover/tests/Feature/InstanceModelTest.php index ae0634e..d6cde01 100644 --- a/packages/Lvl0/FediDiscover/tests/Feature/InstanceModelTest.php +++ b/packages/Lvl0/FediDiscover/tests/Feature/InstanceModelTest.php @@ -64,6 +64,31 @@ public function 
test_last_polled_at_is_fillable_and_cast_to_datetime(): void $this->assertTrue($fresh->last_polled_at->equalTo($polledAt)); } + public function test_last_seen_id_defaults_to_null(): void + { + $instance = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + $this->assertNull($instance->fresh()->last_seen_id); + } + + public function test_last_seen_id_is_fillable_and_persists_as_string(): void + { + $instance = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + 'last_seen_id' => '109876543210', + ]); + + $this->assertSame('109876543210', $instance->fresh()->last_seen_id); + } + public function test_enabled_scope_returns_only_enabled_instances(): void { Instance::create([ diff --git a/packages/Lvl0/FediDiscover/tests/Unit/UrlDiscoveredTest.php b/packages/Lvl0/FediDiscover/tests/Unit/UrlDiscoveredTest.php new file mode 100644 index 0000000..c493961 --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Unit/UrlDiscoveredTest.php @@ -0,0 +1,35 @@ +assertSame('https://example.com/article', $event->url); + $this->assertSame('https://mastodon.social/@alice/109876543210', $event->postUrl); + $this->assertSame('Check out this article: https://example.com/article', $event->postBody); + } + + public function test_post_body_is_nullable(): void + { + $event = new UrlDiscovered( + url: 'https://example.com/article', + postUrl: 'https://mastodon.social/@alice/109876543210', + postBody: null + ); + + $this->assertNull($event->postBody); + } +} From e5ee0184b5c07331b670644cbcc120e866cb2ac9 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sat, 25 Apr 2026 02:09:38 +0200 Subject: [PATCH 10/65] 3 - Add PollFediverseAction with FediverseClient interface --- .../src/Actions/PollFediverseAction.php | 55 +++++ .../src/Clients/FediverseClient.php | 12 + .../src/Clients/FediversePost.php | 14 ++ 
.../FediDiscover/src/Events/UrlDiscovered.php | 4 +- .../Lvl0/FediDiscover/src/Models/Instance.php | 2 +- .../tests/Feature/PollFediverseActionTest.php | 223 ++++++++++++++++++ 6 files changed, 307 insertions(+), 3 deletions(-) create mode 100644 packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php create mode 100644 packages/Lvl0/FediDiscover/src/Clients/FediverseClient.php create mode 100644 packages/Lvl0/FediDiscover/src/Clients/FediversePost.php create mode 100644 packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php diff --git a/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php b/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php new file mode 100644 index 0000000..3200b87 --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php @@ -0,0 +1,55 @@ +client->fetchPostsSince($instance, $instance->last_seen_id)); + + $posts->each(function (FediversePost $post) use ($instance) { + $this->processLinks($post, $instance); + }); + + if ($posts->isNotEmpty()) { + $instance->last_seen_id = $posts->first()->cursorId; + } + + $instance->last_polled_at = now(); + $instance->save(); + } + + private function processLinks(FediversePost $post, Instance $instance): void + { + if ($post->body === null) { + return; + } + + $linksFound = preg_match_all('~https?://[^\s<>"\'()\[\]]+~', $post->body, $matches); + + if ($linksFound === 0) { + return; + } + + $urls = collect($matches[0]) + ->map(fn (string $u) => rtrim($u, '.,;:!?')) + ->filter(fn (string $u) => filter_var($u, FILTER_VALIDATE_URL) !== false) + ->filter(fn (string $u) => parse_url($u, PHP_URL_HOST) !== parse_url($instance->url, PHP_URL_HOST)) + ->unique() + ->each(fn (string $url) => UrlDiscovered::dispatch( + url: $url, + postUrl: $post->selfUrl, + postBody: $post->body, + )); + } +} diff --git a/packages/Lvl0/FediDiscover/src/Clients/FediverseClient.php b/packages/Lvl0/FediDiscover/src/Clients/FediverseClient.php new file mode 100644 index 
0000000..c69049d --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Clients/FediverseClient.php @@ -0,0 +1,12 @@ + $extras - * @property int $last_seen_id + * @property string|null $last_seen_id * @property Carbon|null $last_polled_at * @property Carbon $created_at * @property Carbon $updated_at diff --git a/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php b/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php new file mode 100644 index 0000000..48abc09 --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php @@ -0,0 +1,223 @@ +poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://example.com/one and https://other.example/two'), + ]); + + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/one'); + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://other.example/two'); + Event::assertDispatchedTimes(UrlDiscovered::class, 2); + } + + public function test_it_extracts_urls_from_html_anchor_tags(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', '

<p>Check <a href="https://example.com/article">this</a>!</p>

'), + ]); + + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/article'); + Event::assertDispatchedTimes(UrlDiscovered::class, 1); + } + + public function test_it_extracts_urls_from_markdown_links(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll( + posts: [new FediversePost('1', 'https://lemmy.world/post/42', 'A [great article](https://example.com/article) about trees.')], + instanceUrl: 'https://lemmy.world', + ); + + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/article'); + Event::assertDispatchedTimes(UrlDiscovered::class, 1); + } + + public function test_it_strips_trailing_punctuation_from_urls(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'Check https://example.com/article, it is great. Also https://other.example/page.'), + ]); + + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/article'); + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://other.example/page'); + } + + public function test_it_deduplicates_urls_within_a_single_post(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'Here is https://example.com/article and again https://example.com/article'), + ]); + + Event::assertDispatchedTimes(UrlDiscovered::class, 1); + } + + public function test_it_filters_urls_on_the_polling_instance_host(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://mastodon.social/@bob/42 and https://example.com/article'), + ]); + + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->url === 'https://example.com/article'); + Event::assertDispatchedTimes(UrlDiscovered::class, 1); + } 
+ + public function test_it_ignores_posts_with_a_null_body(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', null), + ]); + + Event::assertNotDispatched(UrlDiscovered::class); + } + + public function test_it_ignores_non_http_schemes(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'Email mailto:alice@example.com or try ftp://files.example.com/x'), + ]); + + Event::assertNotDispatched(UrlDiscovered::class); + } + + public function test_it_passes_post_self_url_and_body_through_to_the_event(): void + { + Event::fake([UrlDiscovered::class]); + + $body = 'Here is https://example.com/article with surrounding context.'; + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', $body), + ]); + + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->postUrl === 'https://mastodon.social/@alice/1' && $e->postBody === $body + ); + } + + public function test_it_processes_multiple_posts(): void + { + Event::fake([UrlDiscovered::class]); + + $this->poll([ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://example.com/one'), + new FediversePost('2', 'https://mastodon.social/@bob/2', 'Also https://example.com/two'), + ]); + + Event::assertDispatchedTimes(UrlDiscovered::class, 2); + } + + public function test_it_updates_last_seen_id_to_the_first_posts_cursor(): void + { + $instance = $this->makeInstance(); + + // Clients return newest-first; the action treats posts[0] + // as the new high-water mark without inspecting cursor values. 
+ $this->pollInstance($instance, [ + new FediversePost('newest-cursor', 'https://mastodon.social/@alice/3', 'x'), + new FediversePost('middle-cursor', 'https://mastodon.social/@bob/2', 'y'), + new FediversePost('oldest-cursor', 'https://mastodon.social/@carol/1', 'z'), + ]); + + $this->assertSame('newest-cursor', $instance->fresh()->last_seen_id); + } + + public function test_it_updates_last_polled_at(): void + { + $instance = $this->makeInstance(); + $this->assertNull($instance->last_polled_at); + + $this->pollInstance($instance, [ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'x'), + ]); + + $this->assertNotNull($instance->fresh()->last_polled_at); + } + + public function test_it_passes_the_existing_last_seen_id_to_the_client(): void + { + $instance = $this->makeInstance(['last_seen_id' => '999']); + + $client = Mockery::mock(FediverseClient::class); + $client->shouldReceive('fetchPostsSince') + ->once() + ->with($instance, $instance->last_seen_id) + ->andReturn([]); + + (new PollFediverseAction($client))->execute($instance); + } + + public function test_it_leaves_last_seen_id_unchanged_when_no_posts_are_returned(): void + { + $instance = $this->makeInstance(['last_seen_id' => '500']); + + $this->pollInstance($instance, []); + + $this->assertSame('500', $instance->fresh()->last_seen_id); + } + + /** + * @param array $posts + */ + private function poll(array $posts, string $instanceUrl = 'https://mastodon.social'): void + { + $this->pollInstance($this->makeInstance(['url' => $instanceUrl]), $posts); + } + + /** + * @param array $posts + */ + private function pollInstance(Instance $instance, array $posts): void + { + $client = Mockery::mock(FediverseClient::class); + $client->shouldReceive('fetchPostsSince')->andReturn($posts); + + (new PollFediverseAction($client))->execute($instance); + } + + /** + * @param array $overrides + */ + private function makeInstance(array $overrides = []): Instance + { + return Instance::create(array_merge([ + 'type' => 
InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + ], $overrides)); + } +} From fea8d48f6e0097afd3425b7ef2f5cc034c441402 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sat, 25 Apr 2026 03:11:17 +0200 Subject: [PATCH 11/65] 3 - Add MastodonClient with HTTP-faked tests --- .../src/Actions/PollFediverseAction.php | 6 +- .../src/Clients/FediverseClient.php | 12 -- .../src/Clients/FediverseClientInterface.php | 21 +++ .../src/Clients/MastodonClient.php | 34 ++++ .../tests/Feature/MastodonClientTest.php | 165 ++++++++++++++++++ .../tests/Feature/PollFediverseActionTest.php | 10 +- phpunit.xml | 7 + 7 files changed, 235 insertions(+), 20 deletions(-) delete mode 100644 packages/Lvl0/FediDiscover/src/Clients/FediverseClient.php create mode 100644 packages/Lvl0/FediDiscover/src/Clients/FediverseClientInterface.php create mode 100644 packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php create mode 100644 packages/Lvl0/FediDiscover/tests/Feature/MastodonClientTest.php diff --git a/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php b/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php index 3200b87..9d0f75c 100644 --- a/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php +++ b/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php @@ -4,18 +4,18 @@ namespace Lvl0\FediDiscover\Actions; -use Lvl0\FediDiscover\Clients\FediverseClient; +use Lvl0\FediDiscover\Clients\FediverseClientInterface; use Lvl0\FediDiscover\Clients\FediversePost; use Lvl0\FediDiscover\Events\UrlDiscovered; use Lvl0\FediDiscover\Models\Instance; class PollFediverseAction { - public function __construct(private FediverseClient $client) {} + public function __construct(private FediverseClientInterface $client) {} public function execute(Instance $instance): void { - $posts = collect($this->client->fetchPostsSince($instance, $instance->last_seen_id)); + $posts = $this->client->fetchPostsSince($instance, 
$instance->last_seen_id); $posts->each(function (FediversePost $post) use ($instance) { $this->processLinks($post, $instance); diff --git a/packages/Lvl0/FediDiscover/src/Clients/FediverseClient.php b/packages/Lvl0/FediDiscover/src/Clients/FediverseClient.php deleted file mode 100644 index c69049d..0000000 --- a/packages/Lvl0/FediDiscover/src/Clients/FediverseClient.php +++ /dev/null @@ -1,12 +0,0 @@ - + */ + public function fetchPostsSince(Instance $instance, ?string $lastSeenId): Collection; +} diff --git a/packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php b/packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php new file mode 100644 index 0000000..eedea0e --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php @@ -0,0 +1,34 @@ +url, PHP_URL_HOST) . '/api/v1/timelines/public'; + + $params = $lastSeenId !== null ? ['min_id' => $lastSeenId] : []; + + $response = Http::withHeaders([ + 'User-Agent' => config('fedi-discover.http.user_agent'), + ])->get($url, $params); + + if (! $response->successful() || ! is_array($response->json())) { + return collect(); + } + + return collect($response->json()) + ->map(fn (array $t) => new FediversePost( + cursorId: $t['id'], + selfUrl: $t['url'] ?? 
$t['uri'], + body: $t['content'] + )); + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Feature/MastodonClientTest.php b/packages/Lvl0/FediDiscover/tests/Feature/MastodonClientTest.php new file mode 100644 index 0000000..d7a56bf --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Feature/MastodonClientTest.php @@ -0,0 +1,165 @@ + Http::response([], 200), + ]); + + (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + Http::assertSent(fn ($request) => str_starts_with($request->url(), 'https://mastodon.social/api/v1/timelines/public') + && $request->method() === 'GET' + ); + } + + public function test_it_omits_min_id_on_first_poll(): void + { + Http::fake(['*' => Http::response([], 200)]); + + (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + Http::assertSent(fn ($request) => ! str_contains($request->url(), 'min_id')); + } + + public function test_it_passes_min_id_on_subsequent_polls(): void + { + Http::fake(['*' => Http::response([], 200)]); + + (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), '109876543210'); + + Http::assertSent(fn ($request) => str_contains($request->url(), 'min_id=109876543210')); + } + + public function test_it_returns_an_empty_collection_when_the_api_returns_no_posts(): void + { + Http::fake(['*' => Http::response([], 200)]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertInstanceOf(Collection::class, $posts); + $this->assertTrue($posts->isEmpty()); + } + + public function test_it_maps_each_status_to_a_fediverse_post(): void + { + Http::fake([ + '*' => Http::response([ + $this->mastodonStatus(id: '109876543210', url: 'https://mastodon.social/@alice/109876543210', content: '

<p>Hello</p>

'), + $this->mastodonStatus(id: '109876543211', url: 'https://mastodon.social/@bob/109876543211', content: '

<p>World</p>

'), + ], 200), + ]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertCount(2, $posts); + $this->assertInstanceOf(FediversePost::class, $posts->first()); + $this->assertSame('109876543210', $posts->first()->cursorId); + $this->assertSame('https://mastodon.social/@alice/109876543210', $posts->first()->selfUrl); + $this->assertSame('

<p>Hello</p>

', $posts->first()->body); + } + + public function test_it_falls_back_to_uri_when_url_is_null(): void + { + Http::fake([ + '*' => Http::response([ + $this->mastodonStatus( + id: '109876543210', + url: null, + uri: 'https://hachyderm.io/users/bob/statuses/5678', + content: '

<p>federated post</p>

' + ), + ], 200), + ]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertSame('https://hachyderm.io/users/bob/statuses/5678', $posts->first()->selfUrl); + } + + public function test_it_preserves_newest_first_ordering_from_the_api(): void + { + Http::fake([ + '*' => Http::response([ + $this->mastodonStatus(id: '300', url: 'https://mastodon.social/@a/300'), + $this->mastodonStatus(id: '200', url: 'https://mastodon.social/@b/200'), + $this->mastodonStatus(id: '100', url: 'https://mastodon.social/@c/100'), + ], 200), + ]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertSame(['300', '200', '100'], $posts->pluck('cursorId')->all()); + } + + public function test_it_returns_an_empty_collection_on_a_non_2xx_response(): void + { + Http::fake(['*' => Http::response('Too many requests', 429)]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertInstanceOf(Collection::class, $posts); + $this->assertTrue($posts->isEmpty()); + } + + public function test_it_returns_an_empty_collection_when_the_response_body_is_not_json(): void + { + Http::fake(['*' => Http::response('error', 200)]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertInstanceOf(Collection::class, $posts); + $this->assertTrue($posts->isEmpty()); + } + + public function test_it_sends_the_configured_user_agent(): void + { + Http::fake(['*' => Http::response([], 200)]); + + (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $expected = config('fedi-discover.http.user_agent'); + Http::assertSent(fn ($request) => $request->header('User-Agent')[0] === $expected); + } + + private function mastodonInstance(): Instance + { + return new Instance([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + ]); + } + + /** + * @return array + */ + private function mastodonStatus( 
+ string $id, + ?string $url = null, + ?string $uri = null, + string $content = '

<p>example</p>

', + ): array { + return [ + 'id' => $id, + 'url' => $url, + 'uri' => $uri ?? "https://mastodon.social/users/x/statuses/{$id}", + 'content' => $content, + 'created_at' => '2026-04-25T10:00:00Z', + 'account' => ['acct' => 'alice@mastodon.social'], + ]; + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php b/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php index 48abc09..9d52f77 100644 --- a/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php +++ b/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php @@ -7,7 +7,7 @@ use Illuminate\Foundation\Testing\RefreshDatabase; use Illuminate\Support\Facades\Event; use Lvl0\FediDiscover\Actions\PollFediverseAction; -use Lvl0\FediDiscover\Clients\FediverseClient; +use Lvl0\FediDiscover\Clients\FediverseClientInterface; use Lvl0\FediDiscover\Clients\FediversePost; use Lvl0\FediDiscover\Config\InstanceType; use Lvl0\FediDiscover\Events\UrlDiscovered; @@ -171,11 +171,11 @@ public function test_it_passes_the_existing_last_seen_id_to_the_client(): void { $instance = $this->makeInstance(['last_seen_id' => '999']); - $client = Mockery::mock(FediverseClient::class); + $client = Mockery::mock(FediverseClientInterface::class); $client->shouldReceive('fetchPostsSince') ->once() ->with($instance, $instance->last_seen_id) - ->andReturn([]); + ->andReturn(collect()); (new PollFediverseAction($client))->execute($instance); } @@ -202,8 +202,8 @@ private function poll(array $posts, string $instanceUrl = 'https://mastodon.soci */ private function pollInstance(Instance $instance, array $posts): void { - $client = Mockery::mock(FediverseClient::class); - $client->shouldReceive('fetchPostsSince')->andReturn($posts); + $client = Mockery::mock(FediverseClientInterface::class); + $client->shouldReceive('fetchPostsSince')->andReturn(collect($posts)); (new PollFediverseAction($client))->execute($instance); } diff --git a/phpunit.xml b/phpunit.xml index 46d97dd..9ca208c 
100644 --- a/phpunit.xml +++ b/phpunit.xml @@ -3,6 +3,11 @@ xsi:noNamespaceSchemaLocation="vendor/phpunit/phpunit/phpunit.xsd" bootstrap="vendor/autoload.php" colors="true" + processIsolation="false" + displayDetailsOnPhpunitDeprecations="true" + displayDetailsOnTestsThatTriggerErrors="true" + displayDetailsOnTestsThatTriggerWarnings="true" + displayDetailsOnTestsThatTriggerNotices="true" > @@ -36,5 +41,7 @@ + + From 1b652752e1ca635d18ec8131d115f718c257a5f9 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sat, 25 Apr 2026 10:27:29 +0200 Subject: [PATCH 12/65] 3 - Add fedi-discover:poll command with failure isolation --- .../Console/Commands/PollInstancesCommand.php | 44 +++++ .../src/FediDiscoverServiceProvider.php | 7 + .../Feature/PollInstancesCommandTest.php | 150 ++++++++++++++++++ 3 files changed, 201 insertions(+) create mode 100644 packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php create mode 100644 packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php diff --git a/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php b/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php new file mode 100644 index 0000000..0f8d7d9 --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php @@ -0,0 +1,44 @@ +get() + ->each(function (Instance $instance) use (&$hadFailure) { + try { + $this->action->execute($instance); + } catch (Throwable $e) { + $hadFailure = true; + } + }); + + if ($hadFailure) { + return self::FAILURE; + } + + return self::SUCCESS; + } +} diff --git a/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php b/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php index 203355c..9fcb44f 100644 --- a/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php +++ b/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php @@ -6,6 +6,7 @@ use Illuminate\Support\Facades\Event; use Illuminate\Support\ServiceProvider; +use 
Lvl0\FediDiscover\Console\Commands\PollInstancesCommand; use Lvl0\FediDiscover\Console\Commands\ValidateInstancesCommand; use Lvl0\FediDiscover\Events\UrlDiscovered; @@ -14,6 +15,11 @@ class FediDiscoverServiceProvider extends ServiceProvider public function register(): void { $this->mergeConfigFrom(__DIR__ . '/../config/fedi-discover.php', 'fedi-discover'); + + $this->app->bind( + \Lvl0\FediDiscover\Clients\FediverseClientInterface::class, + \Lvl0\FediDiscover\Clients\MastodonClient::class, + ); } public function boot(): void @@ -30,6 +36,7 @@ public function boot(): void ], 'fedi-discover-config'); $this->commands([ + PollInstancesCommand::class, ValidateInstancesCommand::class, ]); } diff --git a/packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php b/packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php new file mode 100644 index 0000000..31ac841 --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php @@ -0,0 +1,150 @@ +shouldReceive('fetchPostsSince')->andReturn(collect()); + + $this->app->bind(FediverseClientInterface::class, fn () => $stub); + } + + public function test_it_exits_zero_when_there_are_no_enabled_instances(): void + { + $this->artisan('fedi-discover:poll') + ->assertExitCode(0); + } + + public function test_it_calls_the_action_for_each_enabled_instance_and_skips_disabled(): void + { + $enabled1 = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://mastodon.social', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + $enabled2 = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://fosstodon.org', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://disabled.example', + 'enabled' => false, + 'interval_seconds' => 600, + ]); + + $calledWith = []; + + $action = Mockery::mock(PollFediverseAction::class); + $action->shouldReceive('execute') + ->twice() + 
->withArgs(function (Instance $instance) use (&$calledWith): bool { + $calledWith[] = $instance->url; + + return true; + }); + + $this->app->instance(PollFediverseAction::class, $action); + + $this->artisan('fedi-discover:poll')->assertExitCode(0); + + $this->assertEqualsCanonicalizing( + [$enabled1->url, $enabled2->url], + $calledWith, + ); + } + + public function test_one_instance_throwing_does_not_stop_remaining_instances_from_being_polled(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://failing.example', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + $healthy = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://healthy.example', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + $calledWith = []; + + $action = Mockery::mock(PollFediverseAction::class); + $action->shouldReceive('execute') + ->twice() + ->andReturnUsing(function (Instance $instance) use (&$calledWith): void { + $calledWith[] = $instance->url; + + if ($instance->url === 'https://failing.example') { + throw new RuntimeException('Connection refused'); + } + }); + + $this->app->instance(PollFediverseAction::class, $action); + + $this->artisan('fedi-discover:poll')->assertExitCode(1); + + $this->assertEqualsCanonicalizing( + ['https://failing.example', $healthy->url], + $calledWith, + ); + } + + public function test_it_exits_one_when_at_least_one_instance_fails(): void + { + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://failing.example', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://healthy.example', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + $action = Mockery::mock(PollFediverseAction::class); + $action->shouldReceive('execute') + ->twice() + ->andReturnUsing(function (Instance $instance): void { + if ($instance->url === 'https://failing.example') { + throw new 
RuntimeException('Connection refused'); + } + }); + + $this->app->instance(PollFediverseAction::class, $action); + + $this->artisan('fedi-discover:poll')->assertExitCode(1); + } +} From 1b713e353998a58e79873b02fbfa8db0198f1d59 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 00:42:21 +0200 Subject: [PATCH 13/65] 3 - Add LemmyClient with FediverseClientFactory dispatch --- .../src/Actions/PollFediverseAction.php | 11 +- .../src/Clients/FediverseClientFactory.php | 24 +++ .../FediDiscover/src/Clients/LemmyClient.php | 43 +++++ .../src/Clients/MastodonClient.php | 4 +- .../FediDiscover/src/Config/InstanceType.php | 1 + .../src/FediDiscoverServiceProvider.php | 6 +- .../FediversePost.php | 6 +- .../Feature/FediverseClientFactoryTest.php | 45 ++++++ .../tests/Feature/LemmyClientTest.php | 150 ++++++++++++++++++ .../tests/Feature/MastodonClientTest.php | 28 +++- .../tests/Feature/PollFediverseActionTest.php | 13 +- .../Feature/PollInstancesCommandTest.php | 12 +- 12 files changed, 323 insertions(+), 20 deletions(-) create mode 100644 packages/Lvl0/FediDiscover/src/Clients/FediverseClientFactory.php create mode 100644 packages/Lvl0/FediDiscover/src/Clients/LemmyClient.php rename packages/Lvl0/FediDiscover/src/{Clients => ValueObjects}/FediversePost.php (50%) create mode 100644 packages/Lvl0/FediDiscover/tests/Feature/FediverseClientFactoryTest.php create mode 100644 packages/Lvl0/FediDiscover/tests/Feature/LemmyClientTest.php diff --git a/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php b/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php index 9d0f75c..2c36212 100644 --- a/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php +++ b/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php @@ -4,18 +4,19 @@ namespace Lvl0\FediDiscover\Actions; -use Lvl0\FediDiscover\Clients\FediverseClientInterface; -use Lvl0\FediDiscover\Clients\FediversePost; +use Lvl0\FediDiscover\Clients\FediverseClientFactory; use 
Lvl0\FediDiscover\Events\UrlDiscovered; use Lvl0\FediDiscover\Models\Instance; +use Lvl0\FediDiscover\ValueObjects\FediversePost; class PollFediverseAction { - public function __construct(private FediverseClientInterface $client) {} + public function __construct(private FediverseClientFactory $factory) {} public function execute(Instance $instance): void { - $posts = $this->client->fetchPostsSince($instance, $instance->last_seen_id); + $client = $this->factory->for($instance); + $posts = $client->fetchPostsSince($instance, $instance->last_seen_id); $posts->each(function (FediversePost $post) use ($instance) { $this->processLinks($post, $instance); @@ -41,7 +42,7 @@ private function processLinks(FediversePost $post, Instance $instance): void return; } - $urls = collect($matches[0]) + collect($matches[0]) ->map(fn (string $u) => rtrim($u, '.,;:!?')) ->filter(fn (string $u) => filter_var($u, FILTER_VALIDATE_URL) !== false) ->filter(fn (string $u) => parse_url($u, PHP_URL_HOST) !== parse_url($instance->url, PHP_URL_HOST)) diff --git a/packages/Lvl0/FediDiscover/src/Clients/FediverseClientFactory.php b/packages/Lvl0/FediDiscover/src/Clients/FediverseClientFactory.php new file mode 100644 index 0000000..5cb96ca --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Clients/FediverseClientFactory.php @@ -0,0 +1,24 @@ +type) { + InstanceType::Mastodon => $this->mastodonClient, + InstanceType::Lemmy => $this->lemmyClient, + }; + } +} diff --git a/packages/Lvl0/FediDiscover/src/Clients/LemmyClient.php b/packages/Lvl0/FediDiscover/src/Clients/LemmyClient.php new file mode 100644 index 0000000..7551b08 --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Clients/LemmyClient.php @@ -0,0 +1,43 @@ +url, PHP_URL_HOST) . '/api/v3/post/list'; + + $params = $lastSeenId !== null ? ['min_id' => $lastSeenId] : []; + + $response = Http::withHeaders([ + 'User-Agent' => config('fedi-discover.http.user_agent'), + ])->get($url, $params); + + if (! $response->successful() || ! 
is_array($response->json())) { + return collect(); + } + + return collect($response->json()['posts']) + ->map(fn (array $p) => $p['post']) + ->map(function (array $t) { + $parts = array_filter([$t['body'] ?? null, $t['url'] ?? null]); + $body = $parts ? implode(' ', $parts) : null; + + return new FediversePost( + cursorId: (string) $t['id'], + selfUrl: $t['ap_id'], + body: $body, + title: $t['name'], + publishedAt: $t['published'] + ); + }); + } +} diff --git a/packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php b/packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php index eedea0e..d6e58e5 100644 --- a/packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php +++ b/packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php @@ -7,6 +7,7 @@ use Illuminate\Support\Collection; use Illuminate\Support\Facades\Http; use Lvl0\FediDiscover\Models\Instance; +use Lvl0\FediDiscover\ValueObjects\FediversePost; class MastodonClient implements FediverseClientInterface { @@ -28,7 +29,8 @@ public function fetchPostsSince(Instance $instance, ?string $lastSeenId): Collec ->map(fn (array $t) => new FediversePost( cursorId: $t['id'], selfUrl: $t['url'] ?? $t['uri'], - body: $t['content'] + body: $t['content'], + publishedAt: $t['created_at'] ?? 
null )); } } diff --git a/packages/Lvl0/FediDiscover/src/Config/InstanceType.php b/packages/Lvl0/FediDiscover/src/Config/InstanceType.php index fe70e64..b7c4fce 100644 --- a/packages/Lvl0/FediDiscover/src/Config/InstanceType.php +++ b/packages/Lvl0/FediDiscover/src/Config/InstanceType.php @@ -7,4 +7,5 @@ enum InstanceType: string { case Mastodon = 'mastodon'; + case Lemmy = 'lemmy'; } diff --git a/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php b/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php index 9fcb44f..27cd9ec 100644 --- a/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php +++ b/packages/Lvl0/FediDiscover/src/FediDiscoverServiceProvider.php @@ -6,6 +6,7 @@ use Illuminate\Support\Facades\Event; use Illuminate\Support\ServiceProvider; +use Lvl0\FediDiscover\Clients\FediverseClientFactory; use Lvl0\FediDiscover\Console\Commands\PollInstancesCommand; use Lvl0\FediDiscover\Console\Commands\ValidateInstancesCommand; use Lvl0\FediDiscover\Events\UrlDiscovered; @@ -16,10 +17,7 @@ public function register(): void { $this->mergeConfigFrom(__DIR__ . 
'/../config/fedi-discover.php', 'fedi-discover'); - $this->app->bind( - \Lvl0\FediDiscover\Clients\FediverseClientInterface::class, - \Lvl0\FediDiscover\Clients\MastodonClient::class, - ); + $this->app->singleton(FediverseClientFactory::class); } public function boot(): void diff --git a/packages/Lvl0/FediDiscover/src/Clients/FediversePost.php b/packages/Lvl0/FediDiscover/src/ValueObjects/FediversePost.php similarity index 50% rename from packages/Lvl0/FediDiscover/src/Clients/FediversePost.php rename to packages/Lvl0/FediDiscover/src/ValueObjects/FediversePost.php index fa1c87e..e8d0423 100644 --- a/packages/Lvl0/FediDiscover/src/Clients/FediversePost.php +++ b/packages/Lvl0/FediDiscover/src/ValueObjects/FediversePost.php @@ -2,13 +2,15 @@ declare(strict_types=1); -namespace Lvl0\FediDiscover\Clients; +namespace Lvl0\FediDiscover\ValueObjects; class FediversePost { public function __construct( public string $cursorId, public string $selfUrl, - public ?string $body + public ?string $body = null, + public ?string $title = null, + public ?string $publishedAt = null, ) {} } diff --git a/packages/Lvl0/FediDiscover/tests/Feature/FediverseClientFactoryTest.php b/packages/Lvl0/FediDiscover/tests/Feature/FediverseClientFactoryTest.php new file mode 100644 index 0000000..06e278a --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Feature/FediverseClientFactoryTest.php @@ -0,0 +1,45 @@ + InstanceType::Mastodon, 'url' => 'https://mastodon.social']); + + $client = $factory->for($instance); + + $this->assertInstanceOf(MastodonClient::class, $client); + } + + public function test_it_resolves_lemmy_client_for_lemmy_instance_type(): void + { + $factory = app(FediverseClientFactory::class); + + $instance = new Instance(['type' => InstanceType::Lemmy, 'url' => 'https://lemmy.world']); + + $client = $factory->for($instance); + + $this->assertInstanceOf(LemmyClient::class, $client); + } + + public function test_it_is_registered_as_a_singleton_in_the_container(): void + { + $a = 
$this->app->make(FediverseClientFactory::class); + $b = $this->app->make(FediverseClientFactory::class); + + $this->assertSame($a, $b); + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Feature/LemmyClientTest.php b/packages/Lvl0/FediDiscover/tests/Feature/LemmyClientTest.php new file mode 100644 index 0000000..e06b6b1 --- /dev/null +++ b/packages/Lvl0/FediDiscover/tests/Feature/LemmyClientTest.php @@ -0,0 +1,150 @@ + Http::response([ + 'posts' => [ + $this->lemmyPost( + id: 42, + apId: 'https://lemmy.world/post/42', + name: 'My Great Post', + body: 'Some body text', + published: '2026-04-25T10:00:00.000000', + ), + ], + ], 200), + ]); + + $posts = (new LemmyClient)->fetchPostsSince($this->lemmyInstance(), null); + + $this->assertCount(1, $posts); + $this->assertInstanceOf(FediversePost::class, $posts->first()); + $this->assertSame('42', $posts->first()->cursorId); + $this->assertSame('https://lemmy.world/post/42', $posts->first()->selfUrl); + $this->assertSame('My Great Post', $posts->first()->title); + $this->assertSame('Some body text', $posts->first()->body); + $this->assertSame('2026-04-25T10:00:00.000000', $posts->first()->publishedAt); + } + + public function test_url_field_is_appended_to_body(): void + { + Http::fake([ + '*' => Http::response([ + 'posts' => [ + $this->lemmyPost( + id: 42, + apId: 'https://lemmy.world/post/42', + url: 'https://example-garden.blog/post-42', + body: 'Some original text.', + ), + ], + ], 200), + ]); + + $post = (new LemmyClient)->fetchPostsSince($this->lemmyInstance(), null)->first(); + + $this->assertStringContainsString('Some original text.', $post->body); + $this->assertStringContainsString('https://example-garden.blog/post-42', $post->body); + } + + public function test_url_absent_leaves_body_clean(): void + { + Http::fake([ + '*' => Http::response([ + 'posts' => [ + $this->lemmyPost( + id: 7, + apId: 'https://lemmy.world/post/7', + body: 'Just a regular post.', + ), + ], + ], 200), + ]); + + $post = (new 
LemmyClient)->fetchPostsSince($this->lemmyInstance(), null)->first(); + + $this->assertSame('Just a regular post.', $post->body); + } + + public function test_it_handles_posts_without_a_body_key(): void + { + Http::fake([ + '*' => Http::response([ + 'posts' => [ + [ + 'post' => [ + 'id' => 99, + 'ap_id' => 'https://lemmy.world/post/99', + 'url' => null, + 'name' => 'Link-only post', + 'published' => '2026-04-25T10:00:00.000000', + // 'body' key intentionally absent — real Lemmy API omits it for link-only posts + ], + ], + ], + ], 200), + ]); + + $post = (new LemmyClient)->fetchPostsSince($this->lemmyInstance(), null)->first(); + + $this->assertNull($post->body); + } + + public function test_it_hits_the_post_list_endpoint_of_the_instance(): void + { + Http::fake([ + 'lemmy.world/api/v3/post/list*' => Http::response(['posts' => []], 200), + ]); + + (new LemmyClient)->fetchPostsSince($this->lemmyInstance(), null); + + Http::assertSent(fn ($request) => str_starts_with($request->url(), 'https://lemmy.world/api/v3/post/list') + && $request->method() === 'GET' + ); + } + + private function lemmyInstance(): Instance + { + return new Instance([ + 'type' => InstanceType::Lemmy, + 'url' => 'https://lemmy.world', + ]); + } + + /** + * @return array + */ + private function lemmyPost( + int $id, + string $apId, + ?string $url = null, + string $body = '', + string $name = 'A post title', + string $published = '2026-04-25T10:00:00.000000', + ): array { + return [ + 'post' => [ + 'id' => $id, + 'ap_id' => $apId, + 'url' => $url, + 'body' => $body, + 'name' => $name, + 'published' => $published, + ], + ]; + } +} diff --git a/packages/Lvl0/FediDiscover/tests/Feature/MastodonClientTest.php b/packages/Lvl0/FediDiscover/tests/Feature/MastodonClientTest.php index d7a56bf..0516bad 100644 --- a/packages/Lvl0/FediDiscover/tests/Feature/MastodonClientTest.php +++ b/packages/Lvl0/FediDiscover/tests/Feature/MastodonClientTest.php @@ -6,10 +6,10 @@ use Illuminate\Support\Collection; use 
Illuminate\Support\Facades\Http; -use Lvl0\FediDiscover\Clients\FediversePost; use Lvl0\FediDiscover\Clients\MastodonClient; use Lvl0\FediDiscover\Config\InstanceType; use Lvl0\FediDiscover\Models\Instance; +use Lvl0\FediDiscover\ValueObjects\FediversePost; use Tests\TestCase; class MastodonClientTest extends TestCase @@ -73,6 +73,32 @@ public function test_it_maps_each_status_to_a_fediverse_post(): void $this->assertSame('

Hello

', $posts->first()->body); } + public function test_it_maps_published_at_from_created_at(): void + { + Http::fake([ + '*' => Http::response([ + $this->mastodonStatus(id: '109876543210', url: 'https://mastodon.social/@alice/109876543210'), + ], 200), + ]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertSame('2026-04-25T10:00:00Z', $posts->first()->publishedAt); + } + + public function test_it_sets_title_to_null_for_mastodon_statuses(): void + { + Http::fake([ + '*' => Http::response([ + $this->mastodonStatus(id: '109876543210', url: 'https://mastodon.social/@alice/109876543210'), + ], 200), + ]); + + $posts = (new MastodonClient)->fetchPostsSince($this->mastodonInstance(), null); + + $this->assertNull($posts->first()->title); + } + public function test_it_falls_back_to_uri_when_url_is_null(): void { Http::fake([ diff --git a/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php b/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php index 9d52f77..697ee9d 100644 --- a/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php +++ b/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php @@ -7,11 +7,12 @@ use Illuminate\Foundation\Testing\RefreshDatabase; use Illuminate\Support\Facades\Event; use Lvl0\FediDiscover\Actions\PollFediverseAction; +use Lvl0\FediDiscover\Clients\FediverseClientFactory; use Lvl0\FediDiscover\Clients\FediverseClientInterface; -use Lvl0\FediDiscover\Clients\FediversePost; use Lvl0\FediDiscover\Config\InstanceType; use Lvl0\FediDiscover\Events\UrlDiscovered; use Lvl0\FediDiscover\Models\Instance; +use Lvl0\FediDiscover\ValueObjects\FediversePost; use Mockery; use Tests\TestCase; @@ -177,7 +178,10 @@ public function test_it_passes_the_existing_last_seen_id_to_the_client(): void ->with($instance, $instance->last_seen_id) ->andReturn(collect()); - (new PollFediverseAction($client))->execute($instance); + $factory = 
Mockery::mock(FediverseClientFactory::class); + $factory->shouldReceive('for')->with($instance)->andReturn($client); + + (new PollFediverseAction($factory))->execute($instance); } public function test_it_leaves_last_seen_id_unchanged_when_no_posts_are_returned(): void @@ -205,7 +209,10 @@ private function pollInstance(Instance $instance, array $posts): void $client = Mockery::mock(FediverseClientInterface::class); $client->shouldReceive('fetchPostsSince')->andReturn(collect($posts)); - (new PollFediverseAction($client))->execute($instance); + $factory = Mockery::mock(FediverseClientFactory::class); + $factory->shouldReceive('for')->andReturn($client); + + (new PollFediverseAction($factory))->execute($instance); } /** diff --git a/packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php b/packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php index 31ac841..f1797c7 100644 --- a/packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php +++ b/packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php @@ -6,6 +6,7 @@ use Illuminate\Foundation\Testing\RefreshDatabase; use Lvl0\FediDiscover\Actions\PollFediverseAction; +use Lvl0\FediDiscover\Clients\FediverseClientFactory; use Lvl0\FediDiscover\Clients\FediverseClientInterface; use Lvl0\FediDiscover\Config\InstanceType; use Lvl0\FediDiscover\Models\Instance; @@ -21,12 +22,15 @@ protected function setUp(): void { parent::setUp(); - // Bind a no-op stub so the command can resolve PollFediverseAction + // Bind a no-op factory stub so the command can resolve PollFediverseAction // from the container without making real HTTP calls. 
- $stub = Mockery::mock(FediverseClientInterface::class); - $stub->shouldReceive('fetchPostsSince')->andReturn(collect()); + $clientStub = Mockery::mock(FediverseClientInterface::class); + $clientStub->shouldReceive('fetchPostsSince')->andReturn(collect()); - $this->app->bind(FediverseClientInterface::class, fn () => $stub); + $factoryStub = Mockery::mock(FediverseClientFactory::class); + $factoryStub->shouldReceive('for')->andReturn($clientStub); + + $this->app->instance(FediverseClientFactory::class, $factoryStub); } public function test_it_exits_zero_when_there_are_no_enabled_instances(): void From 2cb86f333772bcefdd5a8e73689a4d1261e813ac Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 00:46:59 +0200 Subject: [PATCH 14/65] 3 - Schedule fedi-discover:poll every minute --- routes/console.php | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/routes/console.php b/routes/console.php index 3c9adf1..cbb33b7 100644 --- a/routes/console.php +++ b/routes/console.php @@ -1,8 +1,8 @@ comment(Inspiring::quote()); -})->purpose('Display an inspiring quote'); +Schedule::command('fedi-discover:poll') + ->everyMinute() + ->withoutOverlapping(5) + ->runInBackground(); From ec2113710a79aa677ffb42a8caf8d4a35a17516e Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 01:15:35 +0200 Subject: [PATCH 15/65] 3 - Harden fediverse polling: timeouts, error handling, payload fields --- database/factories/PageFactory.php | 24 ------------------- .../src/Actions/PollFediverseAction.php | 17 ++++++++++++- .../src/Clients/FediverseClientInterface.php | 1 + .../FediDiscover/src/Clients/LemmyClient.php | 6 ++--- .../src/Clients/MastodonClient.php | 8 +++---- .../Console/Commands/PollInstancesCommand.php | 8 +++++++ .../FediDiscover/src/Events/UrlDiscovered.php | 18 ++++++-------- .../src/FediDiscoverServiceProvider.php | 6 ----- .../Lvl0/FediDiscover/src/Models/Instance.php | 6 ++++- .../src/ValueObjects/FediversePost.php | 2 +- 
.../tests/Feature/PollFediverseActionTest.php | 9 +++++-- .../tests/Unit/UrlDiscoveredTest.php | 9 +++++++ 12 files changed, 61 insertions(+), 53 deletions(-) delete mode 100644 database/factories/PageFactory.php diff --git a/database/factories/PageFactory.php b/database/factories/PageFactory.php deleted file mode 100644 index 52302a8..0000000 --- a/database/factories/PageFactory.php +++ /dev/null @@ -1,24 +0,0 @@ - - */ -class PageFactory extends Factory -{ - /** - * Define the model's default state. - * - * @return array - */ - public function definition(): array - { - return [ - // - ]; - } -} diff --git a/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php b/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php index 2c36212..11d8767 100644 --- a/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php +++ b/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php @@ -4,10 +4,13 @@ namespace Lvl0\FediDiscover\Actions; +use Carbon\CarbonImmutable; +use Illuminate\Support\Facades\Log; use Lvl0\FediDiscover\Clients\FediverseClientFactory; use Lvl0\FediDiscover\Events\UrlDiscovered; use Lvl0\FediDiscover\Models\Instance; use Lvl0\FediDiscover\ValueObjects\FediversePost; +use Throwable; class PollFediverseAction { @@ -19,7 +22,17 @@ public function execute(Instance $instance): void $posts = $client->fetchPostsSince($instance, $instance->last_seen_id); $posts->each(function (FediversePost $post) use ($instance) { - $this->processLinks($post, $instance); + try { + $this->processLinks($post, $instance); + } catch (Throwable $e) { + Log::warning('fedi-discover:processLinks failed', [ + 'instance_id' => $instance->id, + 'instance_url' => $instance->url, + 'post_url' => $post->selfUrl, + 'exception' => $e::class, + 'message' => $e->getMessage(), + ]); + } }); if ($posts->isNotEmpty()) { @@ -49,6 +62,8 @@ private function processLinks(FediversePost $post, Instance $instance): void ->unique() ->each(fn (string $url) => 
UrlDiscovered::dispatch( url: $url, + instanceId: $instance->id, + discoveredAt: CarbonImmutable::now(), postUrl: $post->selfUrl, postBody: $post->body, )); diff --git a/packages/Lvl0/FediDiscover/src/Clients/FediverseClientInterface.php b/packages/Lvl0/FediDiscover/src/Clients/FediverseClientInterface.php index f4a70a6..de74dfa 100644 --- a/packages/Lvl0/FediDiscover/src/Clients/FediverseClientInterface.php +++ b/packages/Lvl0/FediDiscover/src/Clients/FediverseClientInterface.php @@ -6,6 +6,7 @@ use Illuminate\Support\Collection; use Lvl0\FediDiscover\Models\Instance; +use Lvl0\FediDiscover\ValueObjects\FediversePost; interface FediverseClientInterface { diff --git a/packages/Lvl0/FediDiscover/src/Clients/LemmyClient.php b/packages/Lvl0/FediDiscover/src/Clients/LemmyClient.php index 7551b08..792972d 100644 --- a/packages/Lvl0/FediDiscover/src/Clients/LemmyClient.php +++ b/packages/Lvl0/FediDiscover/src/Clients/LemmyClient.php @@ -19,13 +19,13 @@ public function fetchPostsSince(Instance $instance, ?string $lastSeenId): Collec $response = Http::withHeaders([ 'User-Agent' => config('fedi-discover.http.user_agent'), - ])->get($url, $params); + ])->timeout(config('fedi-discover.http.timeout'))->get($url, $params); - if (! $response->successful() || ! is_array($response->json())) { + if (! $response->successful()) { return collect(); } - return collect($response->json()['posts']) + return collect($response->json('posts', [])) ->map(fn (array $p) => $p['post']) ->map(function (array $t) { $parts = array_filter([$t['body'] ?? null, $t['url'] ?? 
null]); diff --git a/packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php b/packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php index d6e58e5..e2ac205 100644 --- a/packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php +++ b/packages/Lvl0/FediDiscover/src/Clients/MastodonClient.php @@ -19,16 +19,16 @@ public function fetchPostsSince(Instance $instance, ?string $lastSeenId): Collec $response = Http::withHeaders([ 'User-Agent' => config('fedi-discover.http.user_agent'), - ])->get($url, $params); + ])->timeout(config('fedi-discover.http.timeout'))->get($url, $params); - if (! $response->successful() || ! is_array($response->json())) { + if (! $response->successful()) { return collect(); } - return collect($response->json()) + return collect($response->json() ?? []) ->map(fn (array $t) => new FediversePost( cursorId: $t['id'], - selfUrl: $t['url'] ?? $t['uri'], + selfUrl: $t['url'] ?? $t['uri'] ?? null, body: $t['content'], publishedAt: $t['created_at'] ?? null )); diff --git a/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php b/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php index 0f8d7d9..41b9604 100644 --- a/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php +++ b/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php @@ -7,6 +7,7 @@ use Illuminate\Console\Attributes\Description; use Illuminate\Console\Attributes\Signature; use Illuminate\Console\Command; +use Illuminate\Support\Facades\Log; use Lvl0\FediDiscover\Actions\PollFediverseAction; use Lvl0\FediDiscover\Models\Instance; use Throwable; @@ -31,6 +32,13 @@ public function handle(): int try { $this->action->execute($instance); } catch (Throwable $e) { + $this->error("Failed to poll {$instance->url}: {$e->getMessage()}"); + Log::warning('fedi-discover:poll failed', [ + 'instance_id' => $instance->id, + 'instance_url' => $instance->url, + 'exception' => $e::class, + 'message' => $e->getMessage(), + ]); 
$hadFailure = true; } }); diff --git a/packages/Lvl0/FediDiscover/src/Events/UrlDiscovered.php b/packages/Lvl0/FediDiscover/src/Events/UrlDiscovered.php index 024cb35..5bcf911 100644 --- a/packages/Lvl0/FediDiscover/src/Events/UrlDiscovered.php +++ b/packages/Lvl0/FediDiscover/src/Events/UrlDiscovered.php @@ -1,26 +1,22 @@ loadMigrationsFrom(__DIR__ . '/../database/migrations'); - Event::listen( - UrlDiscovered::class, - ); - if ($this->app->runningInConsole()) { $this->publishes([ __DIR__ . '/../config/fedi-discover.php' => config_path('fedi-discover.php'), diff --git a/packages/Lvl0/FediDiscover/src/Models/Instance.php b/packages/Lvl0/FediDiscover/src/Models/Instance.php index cabee28..a7211e5 100644 --- a/packages/Lvl0/FediDiscover/src/Models/Instance.php +++ b/packages/Lvl0/FediDiscover/src/Models/Instance.php @@ -40,7 +40,11 @@ class Instance extends Model 'last_polled_at' => 'datetime', ]; - public function scopeEnabled($query): Builder + /** + * @param Builder $query + * @return Builder + */ + public function scopeEnabled(Builder $query): Builder { return $query->where('enabled', true); } diff --git a/packages/Lvl0/FediDiscover/src/ValueObjects/FediversePost.php b/packages/Lvl0/FediDiscover/src/ValueObjects/FediversePost.php index e8d0423..987a84c 100644 --- a/packages/Lvl0/FediDiscover/src/ValueObjects/FediversePost.php +++ b/packages/Lvl0/FediDiscover/src/ValueObjects/FediversePost.php @@ -8,7 +8,7 @@ class FediversePost { public function __construct( public string $cursorId, - public string $selfUrl, + public ?string $selfUrl, public ?string $body = null, public ?string $title = null, public ?string $publishedAt = null, diff --git a/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php b/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php index 697ee9d..231e16c 100644 --- a/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php +++ b/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php @@ -4,6 
+4,7 @@ namespace Lvl0\FediDiscover\Tests\Feature; +use Carbon\CarbonImmutable; use Illuminate\Foundation\Testing\RefreshDatabase; use Illuminate\Support\Facades\Event; use Lvl0\FediDiscover\Actions\PollFediverseAction; @@ -119,13 +120,17 @@ public function test_it_passes_post_self_url_and_body_through_to_the_event(): vo { Event::fake([UrlDiscovered::class]); + $instance = $this->makeInstance(); $body = 'Here is https://example.com/article with surrounding context.'; - $this->poll([ + $this->pollInstance($instance, [ new FediversePost('1', 'https://mastodon.social/@alice/1', $body), ]); - Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->postUrl === 'https://mastodon.social/@alice/1' && $e->postBody === $body + Event::assertDispatched(UrlDiscovered::class, fn (UrlDiscovered $e) => $e->postUrl === 'https://mastodon.social/@alice/1' + && $e->postBody === $body + && $e->instanceId === $instance->id + && $e->discoveredAt instanceof CarbonImmutable ); } diff --git a/packages/Lvl0/FediDiscover/tests/Unit/UrlDiscoveredTest.php b/packages/Lvl0/FediDiscover/tests/Unit/UrlDiscoveredTest.php index c493961..a16c795 100644 --- a/packages/Lvl0/FediDiscover/tests/Unit/UrlDiscoveredTest.php +++ b/packages/Lvl0/FediDiscover/tests/Unit/UrlDiscoveredTest.php @@ -4,6 +4,7 @@ namespace Lvl0\FediDiscover\Tests\Unit; +use Carbon\CarbonImmutable; use Lvl0\FediDiscover\Events\UrlDiscovered; use PHPUnit\Framework\TestCase; @@ -11,13 +12,19 @@ class UrlDiscoveredTest extends TestCase { public function test_it_exposes_all_payload_fields(): void { + $discoveredAt = CarbonImmutable::parse('2026-04-26T12:00:00'); + $event = new UrlDiscovered( url: 'https://example.com/article', + instanceId: 42, + discoveredAt: $discoveredAt, postUrl: 'https://mastodon.social/@alice/109876543210', postBody: 'Check out this article: https://example.com/article' ); $this->assertSame('https://example.com/article', $event->url); + $this->assertSame(42, $event->instanceId); + 
$this->assertTrue($discoveredAt->eq($event->discoveredAt)); $this->assertSame('https://mastodon.social/@alice/109876543210', $event->postUrl); $this->assertSame('Check out this article: https://example.com/article', $event->postBody); } @@ -26,6 +33,8 @@ public function test_post_body_is_nullable(): void { $event = new UrlDiscovered( url: 'https://example.com/article', + instanceId: 1, + discoveredAt: CarbonImmutable::parse('2026-04-26T12:00:00'), postUrl: 'https://mastodon.social/@alice/109876543210', postBody: null ); From bc535c8c0be1d101e89157d464d40e9c21b972f8 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 02:48:39 +0200 Subject: [PATCH 16/65] 4 - Add pages and page_links migrations with PageStatusEnum --- app/Enums/PageStatusEnum.php | 12 +++++++ .../2026_04_25_234157_create_pages_table.php | 34 +++++++++++++++++++ ...6_04_26_001957_create_page_links_table.php | 27 +++++++++++++++ 3 files changed, 73 insertions(+) create mode 100644 app/Enums/PageStatusEnum.php create mode 100644 database/migrations/2026_04_25_234157_create_pages_table.php create mode 100644 database/migrations/2026_04_26_001957_create_page_links_table.php diff --git a/app/Enums/PageStatusEnum.php b/app/Enums/PageStatusEnum.php new file mode 100644 index 0000000..4f73260 --- /dev/null +++ b/app/Enums/PageStatusEnum.php @@ -0,0 +1,12 @@ +id(); + $table->text('url')->unique(); + $table->string('status')->default(PageStatusEnum::Discovered->value)->index(); + $table->string('title')->nullable(); + $table->foreignId('instance_id') + ->nullable() + ->constrained('fedi_discover_instances') + ->nullOnDelete(); + $table->timestampTz('posted_at')->nullable(); + $table->timestampTz('fetched_at')->nullable(); + $table->timestampTz('failed_at')->nullable(); + $table->timestampsTz(); + }); + } + + public function down(): void + { + Schema::dropIfExists('pages'); + } +}; diff --git a/database/migrations/2026_04_26_001957_create_page_links_table.php 
b/database/migrations/2026_04_26_001957_create_page_links_table.php new file mode 100644 index 0000000..296b994 --- /dev/null +++ b/database/migrations/2026_04_26_001957_create_page_links_table.php @@ -0,0 +1,27 @@ +id(); + $table->foreignId('source_page_id')->constrained('pages')->cascadeOnDelete(); + $table->foreignId('target_page_id')->constrained('pages')->cascadeOnDelete(); + $table->timestampsTz(); + + $table->unique(['source_page_id', 'target_page_id']); + }); + } + + public function down(): void + { + Schema::dropIfExists('page_links'); + } +}; From 424ad2ff78fd9811ac132910a9777daa7d890e48 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 02:51:49 +0200 Subject: [PATCH 17/65] 4 - Add Page and PageLink models with factories and unit tests --- app/Models/Page.php | 51 +++++++++++++ app/Models/PageLink.php | 31 ++++++++ database/factories/PageFactory.php | 26 +++++++ database/factories/PageLinkFactory.php | 34 +++++++++ tests/Unit/Models/PageLinkTest.php | 52 ++++++++++++++ tests/Unit/Models/PageTest.php | 99 ++++++++++++++++++++++++++ 6 files changed, 293 insertions(+) create mode 100644 app/Models/Page.php create mode 100644 app/Models/PageLink.php create mode 100644 database/factories/PageFactory.php create mode 100644 database/factories/PageLinkFactory.php create mode 100644 tests/Unit/Models/PageLinkTest.php create mode 100644 tests/Unit/Models/PageTest.php diff --git a/app/Models/Page.php b/app/Models/Page.php new file mode 100644 index 0000000..210de9d --- /dev/null +++ b/app/Models/Page.php @@ -0,0 +1,51 @@ + */ + use HasFactory; + + protected $fillable = [ + 'url', + 'status', + 'title', + 'instance_id', + 'posted_at', + 'fetched_at', + 'failed_at', + ]; + + protected $casts = [ + 'status' => PageStatusEnum::class, + 'posted_at' => 'datetime', + 'fetched_at' => 'datetime', + 'failed_at' => 'datetime', + ]; + + public function instance(): BelongsTo + { + return $this->belongsTo(Instance::class); + } + + public function outgoingLinks(): 
HasMany + { + return $this->hasMany(PageLink::class, 'source_page_id'); + } + + public function incomingLinks(): HasMany + { + return $this->hasMany(PageLink::class, 'target_page_id'); + } +} diff --git a/app/Models/PageLink.php b/app/Models/PageLink.php new file mode 100644 index 0000000..a8e67f8 --- /dev/null +++ b/app/Models/PageLink.php @@ -0,0 +1,31 @@ + */ + use HasFactory; + + protected $fillable = [ + 'source_page_id', + 'target_page_id', + ]; + + public function sourcePage(): BelongsTo + { + return $this->belongsTo(Page::class, 'source_page_id'); + } + + public function targetPage(): BelongsTo + { + return $this->belongsTo(Page::class, 'target_page_id'); + } +} diff --git a/database/factories/PageFactory.php b/database/factories/PageFactory.php new file mode 100644 index 0000000..55f62ca --- /dev/null +++ b/database/factories/PageFactory.php @@ -0,0 +1,26 @@ + + */ +class PageFactory extends Factory +{ + /** + * @return array + */ + public function definition(): array + { + return [ + 'url' => fake()->url(), + 'status' => PageStatusEnum::Discovered, + ]; + } +} diff --git a/database/factories/PageLinkFactory.php b/database/factories/PageLinkFactory.php new file mode 100644 index 0000000..57a2b6f --- /dev/null +++ b/database/factories/PageLinkFactory.php @@ -0,0 +1,34 @@ + + */ +class PageLinkFactory extends Factory +{ + public function definition(): array + { + return []; + } + + public function withSource(Page $page): static + { + return $this->state(fn () => [ + 'source_page_id' => $page->id, + ]); + } + + public function withTarget(Page $page): static + { + return $this->state(fn () => [ + 'target_page_id' => $page->id, + ]); + } +} diff --git a/tests/Unit/Models/PageLinkTest.php b/tests/Unit/Models/PageLinkTest.php new file mode 100644 index 0000000..f7ffba2 --- /dev/null +++ b/tests/Unit/Models/PageLinkTest.php @@ -0,0 +1,52 @@ +create(['url' => 'https://source.example.com/post/1']); + $target = Page::factory()->create(['url' => 
'https://target.example.com/page/2']); + + $link = PageLink::create([ + 'source_page_id' => $source->id, + 'target_page_id' => $target->id, + ]); + + $fresh = $link->fresh(); + + $this->assertNotNull($fresh); + $this->assertSame($source->id, $fresh->source_page_id); + $this->assertSame($target->id, $fresh->target_page_id); + + $this->assertInstanceOf(Page::class, $fresh->sourcePage); + $this->assertSame($source->id, $fresh->sourcePage->id); + + $this->assertInstanceOf(Page::class, $fresh->targetPage); + $this->assertSame($target->id, $fresh->targetPage->id); + } + + public function test_page_link_factory_with_source_and_target_methods_create_a_link(): void + { + $source = Page::factory()->create(['url' => 'https://source.example.com/post/1']); + $target = Page::factory()->create(['url' => 'https://target.example.com/page/2']); + + $link = PageLink::factory() + ->withSource($source) + ->withTarget($target) + ->create(); + + $this->assertSame($source->id, $link->source_page_id); + $this->assertSame($target->id, $link->target_page_id); + } +} diff --git a/tests/Unit/Models/PageTest.php b/tests/Unit/Models/PageTest.php new file mode 100644 index 0000000..02d6f54 --- /dev/null +++ b/tests/Unit/Models/PageTest.php @@ -0,0 +1,99 @@ + 'https://example.com/article', + 'status' => 'discovered', + 'title' => 'An Example Article', + 'instance_id' => null, + 'posted_at' => null, + 'fetched_at' => null, + ]); + + $fresh = $page->fresh(); + + $this->assertNotNull($fresh); + $this->assertSame('https://example.com/article', $fresh->url); + $this->assertSame('An Example Article', $fresh->title); + $this->assertNull($fresh->instance_id); + } + + public function test_page_instance_relationship_returns_the_owning_instance(): void + { + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(); + + $page = Page::create([ + 'url' => 'https://example.com/post/1', + 'status' => 'discovered', + 'instance_id' => $instance->id, + ]); + + $fresh = 
$page->fresh(); + + $this->assertInstanceOf(Instance::class, $fresh->instance); + $this->assertSame($instance->id, $fresh->instance->id); + } + + public function test_page_outgoing_and_incoming_links_relationships(): void + { + $source = Page::factory()->create(['url' => 'https://example.com/source']); + $target = Page::factory()->create(['url' => 'https://example.com/target']); + + PageLink::create([ + 'source_page_id' => $source->id, + 'target_page_id' => $target->id, + ]); + + $freshSource = $source->fresh(); + $freshTarget = $target->fresh(); + + $this->assertCount(1, $freshSource->outgoingLinks); + $this->assertCount(0, $freshSource->incomingLinks); + $this->assertCount(1, $freshTarget->incomingLinks); + $this->assertCount(0, $freshTarget->outgoingLinks); + + $this->assertSame($source->id, $freshTarget->incomingLinks->first()->source_page_id); + $this->assertSame($target->id, $freshSource->outgoingLinks->first()->target_page_id); + } + + public function test_page_status_is_cast_to_enum(): void + { + $cases = [ + ['string' => 'discovered', 'enum' => PageStatusEnum::Discovered], + ['string' => 'fetched', 'enum' => PageStatusEnum::Fetched], + ['string' => 'failed', 'enum' => PageStatusEnum::Failed], + ]; + + foreach ($cases as ['string' => $raw, 'enum' => $expected]) { + $page = Page::create([ + 'url' => 'https://example.com/' . 
$raw, + 'status' => $raw, + ]); + + $fresh = $page->fresh(); + + $this->assertInstanceOf(PageStatusEnum::class, $fresh->status, "status '{$raw}' should cast to PageStatusEnum"); + $this->assertSame($expected, $fresh->status, "status '{$raw}' should equal PageStatusEnum::{$expected->name}"); + } + } +} From 3ad473f4a107a266c986c68af1f607bd17287f1e Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 03:31:32 +0200 Subject: [PATCH 18/65] 4 - Add UrlDiscoveredListener wiring fediverse polling to pages graph --- app/Listeners/UrlDiscoveredListener.php | 43 ++++++ app/Providers/AppServiceProvider.php | 5 +- ...6_04_26_001957_create_page_links_table.php | 4 +- phpunit.xml | 28 ++-- tests/Feature/UrlDiscoveryTest.php | 140 ++++++++++++++++++ 5 files changed, 203 insertions(+), 17 deletions(-) create mode 100644 app/Listeners/UrlDiscoveredListener.php create mode 100644 tests/Feature/UrlDiscoveryTest.php diff --git a/app/Listeners/UrlDiscoveredListener.php b/app/Listeners/UrlDiscoveredListener.php new file mode 100644 index 0000000..f528017 --- /dev/null +++ b/app/Listeners/UrlDiscoveredListener.php @@ -0,0 +1,43 @@ + $event->url], + ['status' => PageStatusEnum::Discovered, 'instance_id' => $event->instanceId], + ); + + if ($event->postUrl === null) { + return; + } + + $sourcePage = Page::firstOrCreate( + ['url' => $event->postUrl], + [ + 'status' => PageStatusEnum::Fetched, + 'instance_id' => $event->instanceId, + 'fetched_at' => $event->discoveredAt, + ], + ); + + PageLink::firstOrCreate([ + 'source_page_id' => $sourcePage->id, + 'target_page_id' => $targetPage->id, + ]); + }); + } +} diff --git a/app/Providers/AppServiceProvider.php b/app/Providers/AppServiceProvider.php index 452e6b6..5cafe3e 100644 --- a/app/Providers/AppServiceProvider.php +++ b/app/Providers/AppServiceProvider.php @@ -2,7 +2,10 @@ namespace App\Providers; +use App\Listeners\UrlDiscoveredListener; +use Illuminate\Support\Facades\Event; use Illuminate\Support\ServiceProvider; +use 
Lvl0\FediDiscover\Events\UrlDiscovered; class AppServiceProvider extends ServiceProvider { @@ -19,6 +22,6 @@ public function register(): void */ public function boot(): void { - // + Event::listen(UrlDiscovered::class, UrlDiscoveredListener::class); } } diff --git a/database/migrations/2026_04_26_001957_create_page_links_table.php b/database/migrations/2026_04_26_001957_create_page_links_table.php index 296b994..b67328c 100644 --- a/database/migrations/2026_04_26_001957_create_page_links_table.php +++ b/database/migrations/2026_04_26_001957_create_page_links_table.php @@ -12,8 +12,8 @@ public function up(): void { Schema::create('page_links', function (Blueprint $table) { $table->id(); - $table->foreignId('source_page_id')->constrained('pages')->cascadeOnDelete(); - $table->foreignId('target_page_id')->constrained('pages')->cascadeOnDelete(); + $table->foreignId('source_page_id')->constrained('pages'); + $table->foreignId('target_page_id')->constrained('pages'); $table->timestampsTz(); $table->unique(['source_page_id', 'target_page_id']); diff --git a/phpunit.xml b/phpunit.xml index 9ca208c..ac75c66 100644 --- a/phpunit.xml +++ b/phpunit.xml @@ -27,20 +27,20 @@ - - - - - - - - - - - - - - + + + + + + + + + + + + + + diff --git a/tests/Feature/UrlDiscoveryTest.php b/tests/Feature/UrlDiscoveryTest.php new file mode 100644 index 0000000..014567a --- /dev/null +++ b/tests/Feature/UrlDiscoveryTest.php @@ -0,0 +1,140 @@ +type(InstanceType::Mastodon) + ->enabled() + ->create(); + } + + private function makeEvent(Instance $instance, array $overrides = []): UrlDiscovered + { + return new UrlDiscovered( + url: $overrides['url'] ?? 'https://example-blog.com/article', + instanceId: $overrides['instanceId'] ?? $instance->id, + discoveredAt: $overrides['discoveredAt'] ?? CarbonImmutable::parse('2026-04-26T12:00:00Z'), + postUrl: array_key_exists('postUrl', $overrides) ? 
$overrides['postUrl'] : 'https://mastodon.social/@alice/109876543210', + postBody: array_key_exists('postBody', $overrides) ? $overrides['postBody'] : 'check this out https://example-blog.com/article', + ); + } + + // --------------------------------------------------------------------------- + // Test 9 — happy path + // --------------------------------------------------------------------------- + + public function test_listener_creates_target_page_and_source_page_with_link(): void + { + $instance = $this->makeInstance(); + $discoveredAt = CarbonImmutable::parse('2026-04-26T12:00:00Z'); + + $event = new UrlDiscovered( + url: 'https://example-blog.com/article', + instanceId: $instance->id, + discoveredAt: $discoveredAt, + postUrl: 'https://mastodon.social/@alice/109876543210', + postBody: 'check this out https://example-blog.com/article', + ); + + event($event); + + // Target page + $targetPage = Page::where('url', 'https://example-blog.com/article')->first(); + $this->assertNotNull($targetPage); + $this->assertSame(PageStatusEnum::Discovered, $targetPage->status); + $this->assertSame($instance->id, $targetPage->instance_id); + + // Source page + $sourcePage = Page::where('url', 'https://mastodon.social/@alice/109876543210')->first(); + $this->assertNotNull($sourcePage); + $this->assertSame(PageStatusEnum::Fetched, $sourcePage->status); + $this->assertSame($instance->id, $sourcePage->instance_id); + $this->assertNotNull($sourcePage->fetched_at); + $this->assertTrue($discoveredAt->equalTo($sourcePage->fetched_at)); + + // Edge + $link = PageLink::where('source_page_id', $sourcePage->id) + ->where('target_page_id', $targetPage->id) + ->first(); + $this->assertNotNull($link); + } + + // --------------------------------------------------------------------------- + // Test 10 — idempotency + // --------------------------------------------------------------------------- + + public function test_listener_is_idempotent_on_repeated_event(): void + { + $instance = 
$this->makeInstance(); + $event = $this->makeEvent($instance); + + event($event); + event($event); + + $this->assertSame(2, Page::count()); + $this->assertSame(1, PageLink::count()); + } + + // --------------------------------------------------------------------------- + // Test 11 — null postUrl: only target page, no edge + // --------------------------------------------------------------------------- + + public function test_listener_with_null_post_url_creates_only_target_page(): void + { + $instance = $this->makeInstance(); + $event = $this->makeEvent($instance, ['postUrl' => null, 'postBody' => null]); + + event($event); + + $this->assertSame(1, Page::count()); + $this->assertSame(0, PageLink::count()); + + $targetPage = Page::where('url', 'https://example-blog.com/article')->first(); + $this->assertNotNull($targetPage); + $this->assertSame(PageStatusEnum::Discovered, $targetPage->status); + $this->assertSame($instance->id, $targetPage->instance_id); + } + + // --------------------------------------------------------------------------- + // Test 12 — listener is queued, not run inline + // --------------------------------------------------------------------------- + + public function test_listener_is_pushed_to_queue_not_run_inline(): void + { + Queue::fake(); + + $instance = $this->makeInstance(); + $event = $this->makeEvent($instance); + + event($event); + + Queue::assertPushed(CallQueuedListener::class, function (CallQueuedListener $job): bool { + return $job->class === UrlDiscoveredListener::class; + }); + } +} From 6b610b699eee191b0e3bfe529a1c51db8fde6970 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 03:52:12 +0200 Subject: [PATCH 19/65] 4 - Drop status promotion in UrlDiscoveredListener; defer to keywords listener --- app/Listeners/UrlDiscoveredListener.php | 8 ++------ tests/Feature/UrlDiscoveryTest.php | 5 ++--- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/app/Listeners/UrlDiscoveredListener.php 
b/app/Listeners/UrlDiscoveredListener.php index f528017..535951a 100644 --- a/app/Listeners/UrlDiscoveredListener.php +++ b/app/Listeners/UrlDiscoveredListener.php @@ -21,17 +21,13 @@ public function handle(UrlDiscovered $event): void ['status' => PageStatusEnum::Discovered, 'instance_id' => $event->instanceId], ); - if ($event->postUrl === null) { + if ($event->postUrl === null || $event->postUrl === $event->url) { return; } $sourcePage = Page::firstOrCreate( ['url' => $event->postUrl], - [ - 'status' => PageStatusEnum::Fetched, - 'instance_id' => $event->instanceId, - 'fetched_at' => $event->discoveredAt, - ], + ['status' => PageStatusEnum::Discovered, 'instance_id' => $event->instanceId], ); PageLink::firstOrCreate([ diff --git a/tests/Feature/UrlDiscoveryTest.php b/tests/Feature/UrlDiscoveryTest.php index 014567a..ff36ed2 100644 --- a/tests/Feature/UrlDiscoveryTest.php +++ b/tests/Feature/UrlDiscoveryTest.php @@ -72,10 +72,9 @@ public function test_listener_creates_target_page_and_source_page_with_link(): v // Source page $sourcePage = Page::where('url', 'https://mastodon.social/@alice/109876543210')->first(); $this->assertNotNull($sourcePage); - $this->assertSame(PageStatusEnum::Fetched, $sourcePage->status); + $this->assertSame(PageStatusEnum::Discovered, $sourcePage->status); $this->assertSame($instance->id, $sourcePage->instance_id); - $this->assertNotNull($sourcePage->fetched_at); - $this->assertTrue($discoveredAt->equalTo($sourcePage->fetched_at)); + $this->assertNull($sourcePage->fetched_at); // Edge $link = PageLink::where('source_page_id', $sourcePage->id) From 0f14c66c343c957536098c2bf7b492374bbe5c66 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 11:54:23 +0200 Subject: [PATCH 20/65] 5 - Publish Livewire config with class-based components and no emoji --- config/livewire.php | 282 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 282 insertions(+) create mode 100644 config/livewire.php diff --git a/config/livewire.php 
b/config/livewire.php new file mode 100644 index 0000000..350f585 --- /dev/null +++ b/config/livewire.php @@ -0,0 +1,282 @@ + [ + resource_path('views/components'), + resource_path('views/livewire'), + ], + + /* + |--------------------------------------------------------------------------- + | Component Namespaces + |--------------------------------------------------------------------------- + | + | This value sets default namespaces that will be used to resolve view-based + | components like single-file and multi-file components. These folders'll + | also be referenced when creating new components via the make command. + | + */ + + 'component_namespaces' => [ + 'layouts' => resource_path('views/layouts'), + 'pages' => resource_path('views/pages'), + ], + + /* + |--------------------------------------------------------------------------- + | Page Layout + |--------------------------------------------------------------------------- + | The view that will be used as the layout when rendering a single component as + | an entire page via `Route::livewire('/post/create', 'pages::create-post')`. + | In this case, the content of pages::create-post will render into $slot. + | + */ + + 'component_layout' => 'layouts::app', + + /* + |--------------------------------------------------------------------------- + | Lazy Loading Placeholder + |--------------------------------------------------------------------------- + | Livewire allows you to lazy load components that would otherwise slow down + | the initial page load. Every component can have a custom placeholder or + | you can define the default placeholder view for all components below. 
+ | + */ + + 'component_placeholder' => null, // Example: 'placeholders::skeleton' + + /* + |--------------------------------------------------------------------------- + | Make Command + |--------------------------------------------------------------------------- + | This value determines the default configuration for the artisan make command + | You can configure the component type (sfc, mfc, class) and whether to use + | the high-voltage (⚡) emoji as a prefix in the sfc|mfc component names. + | + */ + + 'make_command' => [ + 'type' => 'class', // Options: 'sfc', 'mfc', 'class' + 'emoji' => false, // Options: true, false + 'with' => [ + 'js' => false, + 'css' => false, + 'test' => false, + ], + ], + + /* + |--------------------------------------------------------------------------- + | Class Namespace + |--------------------------------------------------------------------------- + | + | This value sets the root class namespace for Livewire component classes in + | your application. This value will change where component auto-discovery + | finds components. It's also referenced by the file creation commands. + | + */ + + 'class_namespace' => 'App\\Livewire', + + /* + |--------------------------------------------------------------------------- + | Class Path + |--------------------------------------------------------------------------- + | + | This value is used to specify the path where Livewire component class files + | are created when running creation commands like `artisan make:livewire`. + | This path is customizable to match your projects directory structure. + | + */ + + 'class_path' => app_path('Livewire'), + + /* + |--------------------------------------------------------------------------- + | View Path + |--------------------------------------------------------------------------- + | + | This value is used to specify where Livewire component Blade templates are + | stored when running file creation commands like `artisan make:livewire`. 
+ | It is also used if you choose to omit a component's render() method. + | + */ + + 'view_path' => resource_path('views/livewire'), + + /* + |--------------------------------------------------------------------------- + | Temporary File Uploads + |--------------------------------------------------------------------------- + | + | Livewire handles file uploads by storing uploads in a temporary directory + | before the file is stored permanently. All file uploads are directed to + | a global endpoint for temporary storage. You may configure this below: + | + */ + + 'temporary_file_upload' => [ + 'disk' => env('LIVEWIRE_TEMPORARY_FILE_UPLOAD_DISK'), // Example: 'local', 's3' | Default: 'default' + 'rules' => null, // Example: ['file', 'mimes:png,jpg'] | Default: ['required', 'file', 'max:12288'] (12MB) + 'directory' => null, // Example: 'tmp' | Default: 'livewire-tmp' + 'middleware' => null, // Example: 'throttle:5,1' | Default: 'throttle:60,1' + 'preview_mimes' => [ // Supported file types for temporary pre-signed file URLs... + 'png', 'gif', 'bmp', 'svg', 'wav', 'mp4', + 'mov', 'avi', 'wmv', 'mp3', 'm4a', + 'jpg', 'jpeg', 'mpga', 'webp', 'wma', + ], + 'max_upload_time' => 5, // Max duration (in minutes) before an upload is invalidated... + 'cleanup' => true, // Should cleanup temporary uploads older than 24 hrs... 
+ ], + + /* + |--------------------------------------------------------------------------- + | Render On Redirect + |--------------------------------------------------------------------------- + | + | This value determines if Livewire will run a component's `render()` method + | after a redirect has been triggered using something like `redirect(...)` + | Setting this to true will render the view once more before redirecting + | + */ + + 'render_on_redirect' => false, + + /* + |--------------------------------------------------------------------------- + | Eloquent Model Binding + |--------------------------------------------------------------------------- + | + | Previous versions of Livewire supported binding directly to eloquent model + | properties using wire:model by default. However, this behavior has been + | deemed too "magical" and has therefore been put under a feature flag. + | + */ + + 'legacy_model_binding' => false, + + /* + |--------------------------------------------------------------------------- + | Auto-inject Frontend Assets + |--------------------------------------------------------------------------- + | + | By default, Livewire automatically injects its JavaScript and CSS into the + | and of pages containing Livewire components. By disabling + | this behavior, you need to use @livewireStyles and @livewireScripts. + | + */ + + 'inject_assets' => true, + + /* + |--------------------------------------------------------------------------- + | Navigate (SPA mode) + |--------------------------------------------------------------------------- + | + | By adding `wire:navigate` to links in your Livewire application, Livewire + | will prevent the default link handling and instead request those pages + | via AJAX, creating an SPA-like effect. Configure this behavior here. 
+ | + */ + + 'navigate' => [ + 'show_progress_bar' => true, + 'progress_bar_color' => '#2299dd', + ], + + /* + |--------------------------------------------------------------------------- + | HTML Morph Markers + |--------------------------------------------------------------------------- + | + | Livewire intelligently "morphs" existing HTML into the newly rendered HTML + | after each update. To make this process more reliable, Livewire injects + | "markers" into the rendered Blade surrounding @if, @class & @foreach. + | + */ + + 'inject_morph_markers' => true, + + /* + |--------------------------------------------------------------------------- + | Smart Wire Keys + |--------------------------------------------------------------------------- + | + | Livewire uses loops and keys used within loops to generate smart keys that + | are applied to nested components that don't have them. This makes using + | nested components more reliable by ensuring that they all have keys. + | + */ + + 'smart_wire_keys' => true, + + /* + |--------------------------------------------------------------------------- + | Pagination Theme + |--------------------------------------------------------------------------- + | + | When enabling Livewire's pagination feature by using the `WithPagination` + | trait, Livewire will use Tailwind templates to render pagination views + | on the page. If you want Bootstrap CSS, you can specify: "bootstrap" + | + */ + + 'pagination_theme' => 'tailwind', + + /* + |--------------------------------------------------------------------------- + | Release Token + |--------------------------------------------------------------------------- + | + | This token is stored client-side and sent along with each request to check + | a users session to see if a new release has invalidated it. If there is + | a mismatch it will throw an error and prompt for a browser refresh. 
+ | + */ + + 'release_token' => 'a', + + /* + |--------------------------------------------------------------------------- + | CSP Safe + |--------------------------------------------------------------------------- + | + | This config is used to determine if Livewire will use the CSP-safe version + | of Alpine in its bundle. This is useful for applications that are using + | strict Content Security Policy (CSP) to protect against XSS attacks. + | + */ + + 'csp_safe' => false, + + /* + |--------------------------------------------------------------------------- + | Payload Guards + |--------------------------------------------------------------------------- + | + | These settings protect against malicious or oversized payloads that could + | cause denial of service. The default values should feel reasonable for + | most web applications. Each can be set to null to disable the limit. + | + */ + + 'payload' => [ + 'max_size' => 1024 * 1024, // 1MB - maximum request payload size in bytes + 'max_nesting_depth' => 10, // Maximum depth of dot-notation property paths + 'max_calls' => 50, // Maximum method calls per request + 'max_components' => 20, // Maximum components per batch request + ], +]; From b0a4102637ec53930ea74b88803c58e9ac96caaf Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 11:56:39 +0200 Subject: [PATCH 21/65] 5 - Trust forwarded headers behind reverse proxy for real client IP --- bootstrap/app.php | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bootstrap/app.php b/bootstrap/app.php index 0be149f..27faf69 100644 --- a/bootstrap/app.php +++ b/bootstrap/app.php @@ -3,6 +3,7 @@ use Illuminate\Foundation\Application; use Illuminate\Foundation\Configuration\Exceptions; use Illuminate\Foundation\Configuration\Middleware; +use Illuminate\Http\Request; return Application::configure(basePath: dirname(__DIR__)) ->withRouting( @@ -11,7 +12,11 @@ health: '/up', ) ->withMiddleware(function (Middleware $middleware): void { - // + 
$middleware->trustProxies( + at: '*', + headers: Request::HEADER_X_FORWARDED_FOR + | Request::HEADER_X_FORWARDED_PROTO, + ); }) ->withExceptions(function (Exceptions $exceptions): void { // From 43837a99db8d3433808f86ccb55cd54f70a3f1e9 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 11:58:51 +0200 Subject: [PATCH 22/65] 5 - Add UrlSubmissionForm Livewire component with rate limiting --- app/Livewire/UrlSubmissionForm.php | 48 ++++++ resources/views/components/layout.blade.php | 18 +++ .../livewire/url-submission-form.blade.php | 14 ++ resources/views/urls/submit.blade.php | 3 + routes/web.php | 4 + tests/Feature/UrlSubmissionTest.php | 145 ++++++++++++++++++ 6 files changed, 232 insertions(+) create mode 100644 app/Livewire/UrlSubmissionForm.php create mode 100644 resources/views/components/layout.blade.php create mode 100644 resources/views/livewire/url-submission-form.blade.php create mode 100644 resources/views/urls/submit.blade.php create mode 100644 tests/Feature/UrlSubmissionTest.php diff --git a/app/Livewire/UrlSubmissionForm.php b/app/Livewire/UrlSubmissionForm.php new file mode 100644 index 0000000..bbf2000 --- /dev/null +++ b/app/Livewire/UrlSubmissionForm.php @@ -0,0 +1,48 @@ +ip(); + + if (RateLimiter::tooManyAttempts($key, 10)) { + $this->addError('rate_limit', 'Too many submissions, try again shortly.'); + + return; + } + + RateLimiter::hit($key, 60); + + $validated = $this->validate([ + 'url' => ['required', 'url:http,https'], + ]); + + Page::firstOrCreate( + ['url' => $validated['url']], + ['status' => PageStatusEnum::Discovered], + ); + + $this->confirmedUrl = $validated['url']; + $this->reset('url'); + } + + public function render(): View + { + return view('livewire.url-submission-form'); + } +} diff --git a/resources/views/components/layout.blade.php b/resources/views/components/layout.blade.php new file mode 100644 index 0000000..5100c31 --- /dev/null +++ b/resources/views/components/layout.blade.php @@ -0,0 +1,18 @@ + + + + + + + 
{{ $title ?? config('app.name') }} + + @vite(['resources/css/app.css', 'resources/js/app.js']) + + @livewireStyles + + + {{ $slot }} + + @livewireScripts + + diff --git a/resources/views/livewire/url-submission-form.blade.php b/resources/views/livewire/url-submission-form.blade.php new file mode 100644 index 0000000..49da751 --- /dev/null +++ b/resources/views/livewire/url-submission-form.blade.php @@ -0,0 +1,14 @@ +
+ @error('rate_limit')

{{ $message }}

@enderror + + @if ($confirmedUrl !== null) +

Thanks, we've received {{ $confirmedUrl }}

+ @else +
+ + + @error('url')

{{ $message }}

@enderror + +
+ @endif +
diff --git a/resources/views/urls/submit.blade.php b/resources/views/urls/submit.blade.php new file mode 100644 index 0000000..266ab36 --- /dev/null +++ b/resources/views/urls/submit.blade.php @@ -0,0 +1,3 @@ + + + diff --git a/routes/web.php b/routes/web.php index 86a06c5..5f96afc 100644 --- a/routes/web.php +++ b/routes/web.php @@ -1,7 +1,11 @@ get('/submit'); + + $response->assertStatus(200); + $response->assertSeeLivewire('url-submission-form'); + } + + // ------------------------------------------------------------------------- + // Test 2 — valid submission creates a page row as Discovered + // ------------------------------------------------------------------------- + + public function test_valid_url_submission_creates_page_as_discovered(): void + { + Livewire::test(UrlSubmissionForm::class) + ->set('url', 'https://example.com/interesting-post') + ->call('submit') + ->assertHasNoErrors(); + + $this->assertDatabaseHas('pages', [ + 'url' => 'https://example.com/interesting-post', + 'status' => PageStatusEnum::Discovered, + 'instance_id' => null, + ]); + } + + // ------------------------------------------------------------------------- + // Test 3 — duplicate submission is idempotent (no second row created) + // ------------------------------------------------------------------------- + + public function test_duplicate_url_submission_does_not_create_second_page(): void + { + $url = 'https://example.com/seen-before'; + + Page::factory()->create([ + 'url' => $url, + 'status' => PageStatusEnum::Discovered, + ]); + + Livewire::test(UrlSubmissionForm::class) + ->set('url', $url) + ->call('submit') + ->assertHasNoErrors(); + + $this->assertDatabaseCount('pages', 1); + } + + // ------------------------------------------------------------------------- + // Test 4 — confirmation state echoes submitted URL + // ------------------------------------------------------------------------- + + public function test_confirmation_state_echoes_submitted_url(): void + { + $url = 
'https://example.com/great-article'; + + Livewire::test(UrlSubmissionForm::class) + ->set('url', $url) + ->call('submit') + ->assertHasNoErrors() + ->assertSet('confirmedUrl', $url) + ->assertSet('url', '') + ->assertSee($url); + } + + // ------------------------------------------------------------------------- + // Test 5 — empty URL fails validation (regression lock) + // ------------------------------------------------------------------------- + + public function test_missing_url_fails_validation(): void + { + Livewire::test(UrlSubmissionForm::class) + ->set('url', '') + ->call('submit') + ->assertHasErrors(['url' => 'required']); + } + + // ------------------------------------------------------------------------- + // Test 6 — invalid URL formats fail validation + // ------------------------------------------------------------------------- + + #[DataProvider('invalidUrls')] + public function test_invalid_url_formats_fail_validation(string $url): void + { + Livewire::test(UrlSubmissionForm::class) + ->set('url', $url) + ->call('submit') + ->assertHasErrors('url'); + } + + public static function invalidUrls(): array + { + return [ + 'no scheme' => ['not-a-url'], + 'disallowed scheme' => ['ftp://example.com'], + 'javascript scheme' => ['javascript:alert(1)'], + ]; + } + + // ------------------------------------------------------------------------- + // Test 7 — rate limit blocks the 11th submission within a minute + // ------------------------------------------------------------------------- + + public function test_rate_limit_blocks_eleventh_submission_within_a_minute(): void + { + // 10 submissions within the limit — each must succeed + for ($i = 1; $i <= 10; $i++) { + Livewire::test(UrlSubmissionForm::class) + ->set('url', "https://example.com/post-{$i}") + ->call('submit') + ->assertHasNoErrors(); + } + + // 11th submission from the same IP must be blocked, with the message visible + Livewire::test(UrlSubmissionForm::class) + ->set('url', 
'https://example.com/post-11') + ->call('submit') + ->assertHasErrors('rate_limit') + ->assertSee('Too many submissions'); + + // The 11th URL must NOT have been persisted + $this->assertDatabaseCount('pages', 10); + } +} From b1b7adeacd8e31fef92b00c1d17ef691b7063f2f Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 12:53:21 +0200 Subject: [PATCH 23/65] 7 - Add language column to pages for crawler-detected language --- app/Models/Page.php | 1 + .../2026_04_25_234157_create_pages_table.php | 1 + tests/Unit/Models/PageTest.php | 21 +++++++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/app/Models/Page.php b/app/Models/Page.php index 210de9d..ab31e45 100644 --- a/app/Models/Page.php +++ b/app/Models/Page.php @@ -20,6 +20,7 @@ class Page extends Model protected $fillable = [ 'url', 'status', + 'language', 'title', 'instance_id', 'posted_at', diff --git a/database/migrations/2026_04_25_234157_create_pages_table.php b/database/migrations/2026_04_25_234157_create_pages_table.php index 541d384..e1df51f 100644 --- a/database/migrations/2026_04_25_234157_create_pages_table.php +++ b/database/migrations/2026_04_25_234157_create_pages_table.php @@ -15,6 +15,7 @@ public function up(): void $table->id(); $table->text('url')->unique(); $table->string('status')->default(PageStatusEnum::Discovered->value)->index(); + $table->string('language', 35)->nullable()->index(); $table->string('title')->nullable(); $table->foreignId('instance_id') ->nullable() diff --git a/tests/Unit/Models/PageTest.php b/tests/Unit/Models/PageTest.php index 02d6f54..8319510 100644 --- a/tests/Unit/Models/PageTest.php +++ b/tests/Unit/Models/PageTest.php @@ -76,6 +76,27 @@ public function test_page_outgoing_and_incoming_links_relationships(): void $this->assertSame($target->id, $freshSource->outgoingLinks->first()->target_page_id); } + public function test_page_language_is_fillable_and_persists(): void + { + $page = Page::create([ + 'url' => 'https://example.com/crawled', + 'status' 
=> 'discovered', + 'language' => 'en', + ]); + + $fresh = $page->fresh(); + + $this->assertNotNull($fresh); + $this->assertSame('en', $fresh->language); + + $unset = Page::create([ + 'url' => 'https://example.com/no-language', + 'status' => 'discovered', + ]); + + $this->assertNull($unset->fresh()->language); + } + public function test_page_status_is_cast_to_enum(): void { $cases = [ From 9dd6d84d653efb44917c8402d25a1c8250f2c54e Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 13:06:22 +0200 Subject: [PATCH 24/65] 7 - Add CrawlOutcomeEnum for crawl attempt outcomes --- app/Enums/CrawlOutcomeEnum.php | 15 ++++++++++ tests/Unit/Enums/CrawlOutcomeEnumTest.php | 35 +++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 app/Enums/CrawlOutcomeEnum.php create mode 100644 tests/Unit/Enums/CrawlOutcomeEnumTest.php diff --git a/app/Enums/CrawlOutcomeEnum.php b/app/Enums/CrawlOutcomeEnum.php new file mode 100644 index 0000000..e7f16dd --- /dev/null +++ b/app/Enums/CrawlOutcomeEnum.php @@ -0,0 +1,15 @@ + 'success', + 'Failed' => 'failed', + 'Timeout' => 'timeout', + 'BlockedRobots' => 'blocked_robots', + 'Blocked4xx' => 'blocked_4xx', + 'Blocked5xx' => 'blocked_5xx', + ]; + + foreach ($expected as $caseName => $backingValue) { + $case = CrawlOutcomeEnum::from($backingValue); + + $this->assertSame($caseName, $case->name, "Case name for '{$backingValue}' should be '{$caseName}'"); + $this->assertSame($backingValue, $case->value, "Backing value for '{$caseName}' should be '{$backingValue}'"); + } + } + + public function test_enum_has_exactly_six_cases(): void + { + $this->assertCount(6, CrawlOutcomeEnum::cases()); + } +} From fe8ca7fc10397eeae99e3996c8ade035e7bb2935 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 14:15:49 +0200 Subject: [PATCH 25/65] 7 - Add page_crawls migration, PageCrawl model, factory, and Page relationships --- app/Models/Page.php | 11 +++ app/Models/PageCrawl.php | 41 ++++++++++ 
database/factories/PageCrawlFactory.php | 71 ++++++++++++++++ ..._04_26_111140_create_page_crawls_table.php | 47 +++++++++++ tests/Unit/Models/PageCrawlFactoryTest.php | 56 +++++++++++++ tests/Unit/Models/PageCrawlTest.php | 82 +++++++++++++++++++ tests/Unit/Models/PageTest.php | 43 ++++++++++ 7 files changed, 351 insertions(+) create mode 100644 app/Models/PageCrawl.php create mode 100644 database/factories/PageCrawlFactory.php create mode 100644 database/migrations/2026_04_26_111140_create_page_crawls_table.php create mode 100644 tests/Unit/Models/PageCrawlFactoryTest.php create mode 100644 tests/Unit/Models/PageCrawlTest.php diff --git a/app/Models/Page.php b/app/Models/Page.php index ab31e45..60ce74d 100644 --- a/app/Models/Page.php +++ b/app/Models/Page.php @@ -10,6 +10,7 @@ use Illuminate\Database\Eloquent\Model; use Illuminate\Database\Eloquent\Relations\BelongsTo; use Illuminate\Database\Eloquent\Relations\HasMany; +use Illuminate\Database\Eloquent\Relations\HasOne; use Lvl0\FediDiscover\Models\Instance; class Page extends Model @@ -49,4 +50,14 @@ public function incomingLinks(): HasMany { return $this->hasMany(PageLink::class, 'target_page_id'); } + + public function crawls(): HasMany + { + return $this->hasMany(PageCrawl::class); + } + + public function latestCrawl(): HasOne + { + return $this->hasOne(PageCrawl::class)->latestOfMany('created_at'); + } } diff --git a/app/Models/PageCrawl.php b/app/Models/PageCrawl.php new file mode 100644 index 0000000..a615e77 --- /dev/null +++ b/app/Models/PageCrawl.php @@ -0,0 +1,41 @@ + */ + use HasFactory; + + protected $fillable = [ + 'page_id', + 'domain', + 'priority', + 'scheduled_for', + 'completed_at', + 'outcome', + 'status_code', + 'error_message', + 'locked_at', + ]; + + protected $casts = [ + 'scheduled_for' => 'datetime', + 'completed_at' => 'datetime', + 'outcome' => CrawlOutcomeEnum::class, + 'locked_at' => 'datetime', + ]; + + public function page(): BelongsTo + { + return $this->belongsTo(Page::class); 
+ } +} diff --git a/database/factories/PageCrawlFactory.php b/database/factories/PageCrawlFactory.php new file mode 100644 index 0000000..80c6f7c --- /dev/null +++ b/database/factories/PageCrawlFactory.php @@ -0,0 +1,71 @@ + + */ +class PageCrawlFactory extends Factory +{ + public function definition(): array + { + return [ + 'page_id' => null, + 'domain' => 'example.com', + 'priority' => 0, + 'scheduled_for' => now(), + 'completed_at' => null, + 'outcome' => null, + 'status_code' => null, + 'error_message' => null, + 'locked_at' => null, + ]; + } + + public function page(Page $page): static + { + return $this->state(fn () => [ + 'page_id' => $page->id, + ]); + } + + public function successful(): static + { + return $this->state(fn () => [ + 'outcome' => CrawlOutcomeEnum::Success, + 'completed_at' => now(), + ]); + } + + public function failed(string $errorMessage): static + { + return $this->state(fn () => [ + 'outcome' => CrawlOutcomeEnum::Failed, + 'completed_at' => now(), + 'error_message' => $errorMessage, + ]); + } + + public function scheduledAt(Carbon $scheduledAt): static + { + return $this->state(fn () => [ + 'scheduled_for' => $scheduledAt, + ]); + } + + public function locked(): static + { + return $this->state(fn () => [ + 'locked_at' => now(), + 'outcome' => null, + ]); + } +} diff --git a/database/migrations/2026_04_26_111140_create_page_crawls_table.php b/database/migrations/2026_04_26_111140_create_page_crawls_table.php new file mode 100644 index 0000000..b423f25 --- /dev/null +++ b/database/migrations/2026_04_26_111140_create_page_crawls_table.php @@ -0,0 +1,47 @@ +id(); + $table->foreignId('page_id') + ->constrained('pages') + ->cascadeOnDelete(); + $table->string('domain'); + $table->smallInteger('priority')->default(0); + $table->timestampTz('scheduled_for')->useCurrent(); + $table->timestampTz('locked_at')->nullable(); + $table->timestampTz('completed_at')->nullable(); + $table->string('outcome')->nullable(); + 
$table->smallInteger('status_code')->nullable(); + $table->text('error_message')->nullable(); + $table->timestampsTz(); + + $table->index(['page_id', 'created_at']); + }); + + if (DB::getDriverName() === 'pgsql') { + DB::statement('CREATE INDEX page_crawls_pending_domain_idx ON page_crawls (domain) WHERE outcome IS NULL'); + DB::statement('CREATE INDEX page_crawls_pending_poll_idx ON page_crawls (scheduled_for, locked_at) WHERE outcome IS NULL'); + } else { + Schema::table('page_crawls', function (Blueprint $table) { + $table->index('domain'); + $table->index(['scheduled_for', 'locked_at']); + }); + } + } + + public function down(): void + { + Schema::dropIfExists('page_crawls'); + } +}; diff --git a/tests/Unit/Models/PageCrawlFactoryTest.php b/tests/Unit/Models/PageCrawlFactoryTest.php new file mode 100644 index 0000000..65d29cc --- /dev/null +++ b/tests/Unit/Models/PageCrawlFactoryTest.php @@ -0,0 +1,56 @@ +create(); + $crawl = PageCrawl::factory()->page($page)->successful()->create(); + + $this->assertSame(CrawlOutcomeEnum::Success, $crawl->outcome); + $this->assertInstanceOf(Carbon::class, $crawl->completed_at); + $this->assertNull($crawl->error_message); + } + + public function test_factory_failed_state_produces_failed_outcome_with_message(): void + { + $page = Page::factory()->create(); + $crawl = PageCrawl::factory()->page($page)->failed('Connection timed out')->create(); + + $this->assertSame(CrawlOutcomeEnum::Failed, $crawl->outcome); + $this->assertInstanceOf(Carbon::class, $crawl->completed_at); + $this->assertSame('Connection timed out', $crawl->error_message); + } + + public function test_factory_locked_state_produces_in_flight_crawl(): void + { + $page = Page::factory()->create(); + $crawl = PageCrawl::factory()->page($page)->locked()->create(); + + $this->assertInstanceOf(Carbon::class, $crawl->locked_at); + $this->assertNull($crawl->completed_at); + $this->assertNull($crawl->outcome); + } + + public function 
test_factory_scheduled_at_state_overrides_default_scheduled_for(): void + { + $page = Page::factory()->create(); + $timestamp = Carbon::parse('2026-05-01 10:00:00'); + $crawl = PageCrawl::factory()->page($page)->scheduledAt($timestamp)->create(); + + $this->assertTrue($timestamp->equalTo($crawl->scheduled_for)); + } +} diff --git a/tests/Unit/Models/PageCrawlTest.php b/tests/Unit/Models/PageCrawlTest.php new file mode 100644 index 0000000..107e542 --- /dev/null +++ b/tests/Unit/Models/PageCrawlTest.php @@ -0,0 +1,82 @@ +create(['url' => 'https://example.com/page-1']); + + $scheduledFor = Carbon::parse('2026-05-01 10:00:00'); + $lockedAt = Carbon::parse('2026-05-01 10:01:00'); + $completedAt = Carbon::parse('2026-05-01 10:01:05'); + + $crawl = PageCrawl::create([ + 'page_id' => $page->id, + 'domain' => 'example.com', + 'priority' => 5, + 'scheduled_for' => $scheduledFor, + 'locked_at' => $lockedAt, + 'completed_at' => $completedAt, + 'outcome' => CrawlOutcomeEnum::Success, + 'status_code' => 200, + 'error_message' => null, + ]); + + $fresh = $crawl->fresh(); + + $this->assertNotNull($fresh); + + // domain / priority round-trip + $this->assertSame('example.com', $fresh->domain); + $this->assertSame(5, $fresh->priority); + + // outcome is cast to the enum + $this->assertInstanceOf(CrawlOutcomeEnum::class, $fresh->outcome); + $this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome); + + // datetime casts + $this->assertInstanceOf(Carbon::class, $fresh->scheduled_for); + $this->assertInstanceOf(Carbon::class, $fresh->locked_at); + $this->assertInstanceOf(Carbon::class, $fresh->completed_at); + + $this->assertTrue($scheduledFor->equalTo($fresh->scheduled_for)); + $this->assertTrue($lockedAt->equalTo($fresh->locked_at)); + $this->assertTrue($completedAt->equalTo($fresh->completed_at)); + + // nullable columns + $this->assertNull($fresh->error_message); + + // status_code persists + $this->assertSame(200, $fresh->status_code); + } + + public function 
test_page_crawl_belongs_to_a_page(): void + { + $page = Page::factory()->create(['url' => 'https://example.com/page-2']); + + $crawl = PageCrawl::create([ + 'page_id' => $page->id, + 'domain' => 'example.com', + 'priority' => 1, + 'scheduled_for' => Carbon::now(), + ]); + + $related = $crawl->page; + + $this->assertInstanceOf(Page::class, $related); + $this->assertSame($page->id, $related->id); + } +} diff --git a/tests/Unit/Models/PageTest.php b/tests/Unit/Models/PageTest.php index 8319510..217c831 100644 --- a/tests/Unit/Models/PageTest.php +++ b/tests/Unit/Models/PageTest.php @@ -6,7 +6,9 @@ use App\Enums\PageStatusEnum; use App\Models\Page; +use App\Models\PageCrawl; use App\Models\PageLink; +use Carbon\Carbon; use Illuminate\Foundation\Testing\RefreshDatabase; use Lvl0\FediDiscover\Config\InstanceType; use Lvl0\FediDiscover\Models\Instance; @@ -97,6 +99,47 @@ public function test_page_language_is_fillable_and_persists(): void $this->assertNull($unset->fresh()->language); } + public function test_page_has_many_crawls(): void + { + $page = Page::factory()->create(); + $other = Page::factory()->create(); + + PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); + PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); + PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); + PageCrawl::create(['page_id' => $other->id, 'domain' => 'other.com']); + + $crawls = $page->fresh()->crawls; + + $this->assertCount(3, $crawls); + foreach ($crawls as $crawl) { + $this->assertInstanceOf(PageCrawl::class, $crawl); + $this->assertSame($page->id, $crawl->page_id); + } + } + + public function test_page_latest_crawl_returns_row_with_latest_created_at(): void + { + $page = Page::factory()->create(); + + $old = PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); + $old->created_at = Carbon::parse('2026-01-01 08:00:00'); + $old->save(); + + $middle = PageCrawl::create(['page_id' => $page->id, 'domain' => 
'example.com']); + $middle->created_at = Carbon::parse('2026-03-15 12:00:00'); + $middle->save(); + + $newest = PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com', 'error_message' => 'sentinel-latest']); + $newest->created_at = Carbon::parse('2026-05-10 18:00:00'); + $newest->save(); + + $latest = $page->fresh()->latestCrawl; + + $this->assertInstanceOf(PageCrawl::class, $latest); + $this->assertSame('sentinel-latest', $latest->error_message); + } + public function test_page_status_is_cast_to_enum(): void { $cases = [ From f2c1fab4e4843e5322da2354fc5184ca7391615f Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 14:23:13 +0200 Subject: [PATCH 26/65] 7 - Add int casts on PageCrawl and tests for cascade-delete + pending scope --- app/Models/PageCrawl.php | 2 ++ tests/Unit/Models/PageCrawlTest.php | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/app/Models/PageCrawl.php b/app/Models/PageCrawl.php index a615e77..8568bee 100644 --- a/app/Models/PageCrawl.php +++ b/app/Models/PageCrawl.php @@ -28,10 +28,12 @@ class PageCrawl extends Model ]; protected $casts = [ + 'priority' => 'integer', 'scheduled_for' => 'datetime', 'completed_at' => 'datetime', 'outcome' => CrawlOutcomeEnum::class, 'locked_at' => 'datetime', + 'status_code' => 'integer', ]; public function page(): BelongsTo diff --git a/tests/Unit/Models/PageCrawlTest.php b/tests/Unit/Models/PageCrawlTest.php index 107e542..b1f8152 100644 --- a/tests/Unit/Models/PageCrawlTest.php +++ b/tests/Unit/Models/PageCrawlTest.php @@ -79,4 +79,33 @@ public function test_page_crawl_belongs_to_a_page(): void $this->assertInstanceOf(Page::class, $related); $this->assertSame($page->id, $related->id); } + + public function test_deleting_a_page_cascades_to_its_page_crawls(): void + { + $page = Page::factory()->create(['url' => 'https://example.com/page-cascade']); + + PageCrawl::factory()->page($page)->create(); + 
PageCrawl::factory()->page($page)->successful()->create(); + PageCrawl::factory()->page($page)->failed('timeout during fetch')->create(); + + $this->assertSame(3, PageCrawl::count()); + + $page->delete(); + + $this->assertSame(0, PageCrawl::count()); + } + + public function test_pending_crawls_are_filtered_by_null_outcome(): void + { + $page = Page::factory()->create(['url' => 'https://example.com/page-pending']); + + $pending = PageCrawl::factory()->page($page)->create(); + PageCrawl::factory()->page($page)->successful()->create(); + PageCrawl::factory()->page($page)->failed('connection refused')->create(); + + $this->assertSame(1, PageCrawl::whereNull('outcome')->count()); + $this->assertSame($pending->id, PageCrawl::whereNull('outcome')->first()->id); + + $this->assertSame(2, PageCrawl::whereNotNull('outcome')->count()); + } } From 81209125a10b564858fbdb72db4c060bf18fef89 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 14:52:40 +0200 Subject: [PATCH 27/65] 8 - Add UrlService with host extraction method --- app/Services/UrlService.php | 35 +++++++++ tests/Unit/Services/UrlServiceTest.php | 101 +++++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 app/Services/UrlService.php create mode 100644 tests/Unit/Services/UrlServiceTest.php diff --git a/app/Services/UrlService.php b/app/Services/UrlService.php new file mode 100644 index 0000000..287f8fa --- /dev/null +++ b/app/Services/UrlService.php @@ -0,0 +1,35 @@ +scheme(); + if ($scheme === null || $scheme === '') { + throw new InvalidArgumentException("URL has no scheme: {$url}"); + } + if (! 
in_array($scheme, ['http', 'https'], true)) { + throw new InvalidArgumentException("Invalid URL scheme: {$scheme}"); + } + + $host = $uri->host(); + if ($host === null || $host === '') { + throw new InvalidArgumentException("URL has no host: {$url}"); + } + + if (filter_var(trim($host, '[]'), FILTER_VALIDATE_IP) !== false) { + throw new InvalidArgumentException("IP literal hosts not allowed: {$host}"); + } + + return strtolower($host); + } +} diff --git a/tests/Unit/Services/UrlServiceTest.php b/tests/Unit/Services/UrlServiceTest.php new file mode 100644 index 0000000..0632766 --- /dev/null +++ b/tests/Unit/Services/UrlServiceTest.php @@ -0,0 +1,101 @@ +service = new UrlService; + } + + // ------------------------------------------------------------------------- + // Happy path — simple URL + // ------------------------------------------------------------------------- + + public function test_extracts_host_from_simple_url(): void + { + $this->assertSame('example.com', $this->service->host('https://example.com')); + } + + // ------------------------------------------------------------------------- + // Path, query string, and fragment are ignored + // ------------------------------------------------------------------------- + + #[DataProvider('urlsWithNoise')] + public function test_extracts_host_ignoring_path_query_and_fragment(string $url, string $expectedHost): void + { + $this->assertSame($expectedHost, $this->service->host($url)); + } + + public static function urlsWithNoise(): array + { + return [ + 'path only' => ['https://example.com/some/path', 'example.com'], + 'path and query' => ['https://example.com/page?q=hello&lang=en', 'example.com'], + 'path, query, fragment' => ['https://example.com/page?q=1#section', 'example.com'], + 'http scheme with path' => ['http://news.ycombinator.com/item?id=42', 'news.ycombinator.com'], + ]; + } + + // ------------------------------------------------------------------------- + // Port number is stripped from the host + // 
------------------------------------------------------------------------- + + public function test_strips_port_from_host(): void + { + $this->assertSame('example.com', $this->service->host('https://example.com:8080/path')); + } + + // ------------------------------------------------------------------------- + // Host is always returned as lowercase + // ------------------------------------------------------------------------- + + public function test_lowercases_host(): void + { + $this->assertSame('example.com', $this->service->host('https://EXAMPLE.COM/path')); + } + + // ------------------------------------------------------------------------- + // Throws on malformed, disallowed, or IP-literal input + // ------------------------------------------------------------------------- + + #[DataProvider('invalidInputs')] + public function test_throws_on_invalid_input(string $url): void + { + $this->expectException(\InvalidArgumentException::class); + + $this->service->host($url); + } + + public static function invalidInputs(): array + { + return [ + // malformed / missing structure + 'empty string' => [''], + 'no scheme' => ['example.com/path'], + 'scheme only' => ['https://'], + 'bare string' => ['not a url at all'], + + // disallowed schemes + 'javascript scheme' => ['javascript:alert(1)'], + 'ftp scheme' => ['ftp://example.com'], + 'data scheme' => ['data:text/html,

hi

'], + + // IP literals — not valid page-URL hosts for Trove's purposes + 'ipv4 literal' => ['https://192.168.1.1/path'], + 'ipv6 literal' => ['https://[::1]/path'], + 'ipv4 without path' => ['http://10.0.0.1'], + ]; + } +} From de14ae3ad4dba7acee86050bd2ef5796c32b329b Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 15:56:38 +0200 Subject: [PATCH 28/65] 8 - Wire PageObserver to enqueue page_crawls on Page creation --- app/Models/Page.php | 3 + app/Observers/PageObserver.php | 25 +++++++ tests/Feature/PageQueuePopulationTest.php | 81 +++++++++++++++++++++++ tests/Unit/Models/PageCrawlTest.php | 12 ++-- tests/Unit/Models/PageTest.php | 11 ++- 5 files changed, 125 insertions(+), 7 deletions(-) create mode 100644 app/Observers/PageObserver.php create mode 100644 tests/Feature/PageQueuePopulationTest.php diff --git a/app/Models/Page.php b/app/Models/Page.php index 60ce74d..02a0a8e 100644 --- a/app/Models/Page.php +++ b/app/Models/Page.php @@ -5,7 +5,9 @@ namespace App\Models; use App\Enums\PageStatusEnum; +use App\Observers\PageObserver; use Database\Factories\PageFactory; +use Illuminate\Database\Eloquent\Attributes\ObservedBy; use Illuminate\Database\Eloquent\Factories\HasFactory; use Illuminate\Database\Eloquent\Model; use Illuminate\Database\Eloquent\Relations\BelongsTo; @@ -13,6 +15,7 @@ use Illuminate\Database\Eloquent\Relations\HasOne; use Lvl0\FediDiscover\Models\Instance; +#[ObservedBy([PageObserver::class])] class Page extends Model { /** @use HasFactory */ diff --git a/app/Observers/PageObserver.php b/app/Observers/PageObserver.php new file mode 100644 index 0000000..e48cc5d --- /dev/null +++ b/app/Observers/PageObserver.php @@ -0,0 +1,25 @@ + $page->id], + [ + 'domain' => $this->urlService->host($page->url), + 'priority' => 0, + ], + ); + } +} diff --git a/tests/Feature/PageQueuePopulationTest.php b/tests/Feature/PageQueuePopulationTest.php new file mode 100644 index 0000000..4204799 --- /dev/null +++ b/tests/Feature/PageQueuePopulationTest.php 
@@ -0,0 +1,81 @@ +create(['url' => $url]); + + $expectedDomain = (new UrlService)->host($url); + + $this->assertDatabaseHas('page_crawls', [ + 'page_id' => $page->id, + 'domain' => $expectedDomain, + 'priority' => 0, + ]); + + $crawl = PageCrawl::where('page_id', $page->id)->first(); + $this->assertNotNull($crawl); + $this->assertNotNull($crawl->scheduled_for); + } + + public function test_created_page_crawl_has_null_outcome(): void + { + $page = Page::factory()->create(['url' => 'https://example-blog.com/article']); + + $crawl = PageCrawl::where('page_id', $page->id)->first(); + + $this->assertNotNull($crawl); + $this->assertNull($crawl->outcome); + } + + public function test_first_or_create_with_existing_url_does_not_insert_duplicate_crawl(): void + { + $url = 'https://example-blog.com/article'; + + Page::factory()->create(['url' => $url]); + + // Finds the existing row — created event does not fire again + Page::firstOrCreate(['url' => $url], ['status' => 'discovered']); + + $this->assertDatabaseCount('page_crawls', 1); + } + + public function test_updating_a_page_does_not_insert_another_crawl(): void + { + $page = Page::factory()->create(['url' => 'https://example-blog.com/article']); + + $page->update(['title' => 'New Title']); + + $this->assertDatabaseCount('page_crawls', 1); + } + + public function test_bad_url_throws_exception_page_persists_no_crawl_inserted(): void + { + $caught = null; + + try { + Page::create(['url' => 'not-a-url', 'status' => 'discovered']); + } catch (\InvalidArgumentException $e) { + $caught = $e; + } + + $this->assertNotNull($caught, 'Expected InvalidArgumentException to be thrown'); + $this->assertDatabaseHas('pages', ['url' => 'not-a-url']); + $this->assertDatabaseCount('page_crawls', 0); + } +} diff --git a/tests/Unit/Models/PageCrawlTest.php b/tests/Unit/Models/PageCrawlTest.php index b1f8152..4c19294 100644 --- a/tests/Unit/Models/PageCrawlTest.php +++ b/tests/Unit/Models/PageCrawlTest.php @@ -17,7 +17,7 @@ class PageCrawlTest 
extends TestCase public function test_page_crawl_fillable_fields_persist_and_casts_are_applied(): void { - $page = Page::factory()->create(['url' => 'https://example.com/page-1']); + $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-1']); $scheduledFor = Carbon::parse('2026-05-01 10:00:00'); $lockedAt = Carbon::parse('2026-05-01 10:01:00'); @@ -65,7 +65,7 @@ public function test_page_crawl_fillable_fields_persist_and_casts_are_applied(): public function test_page_crawl_belongs_to_a_page(): void { - $page = Page::factory()->create(['url' => 'https://example.com/page-2']); + $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-2']); $crawl = PageCrawl::create([ 'page_id' => $page->id, @@ -82,7 +82,9 @@ public function test_page_crawl_belongs_to_a_page(): void public function test_deleting_a_page_cascades_to_its_page_crawls(): void { - $page = Page::factory()->create(['url' => 'https://example.com/page-cascade']); + // createQuietly() skips the PageObserver so the count of explicit rows is predictable; + // this test is about cascade delete behaviour, not observer side effects. + $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-cascade']); PageCrawl::factory()->page($page)->create(); PageCrawl::factory()->page($page)->successful()->create(); @@ -97,7 +99,9 @@ public function test_deleting_a_page_cascades_to_its_page_crawls(): void public function test_pending_crawls_are_filtered_by_null_outcome(): void { - $page = Page::factory()->create(['url' => 'https://example.com/page-pending']); + // createQuietly() skips the PageObserver; this test counts rows with null/non-null + // outcome — the auto-inserted observer crawl (outcome=null) would corrupt both counts. 
+ $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-pending']); $pending = PageCrawl::factory()->page($page)->create(); PageCrawl::factory()->page($page)->successful()->create(); diff --git a/tests/Unit/Models/PageTest.php b/tests/Unit/Models/PageTest.php index 217c831..27e9740 100644 --- a/tests/Unit/Models/PageTest.php +++ b/tests/Unit/Models/PageTest.php @@ -101,8 +101,10 @@ public function test_page_language_is_fillable_and_persists(): void public function test_page_has_many_crawls(): void { - $page = Page::factory()->create(); - $other = Page::factory()->create(); + // createQuietly() skips the PageObserver so no auto-crawl row is inserted; + // this test is about HasMany scoping, not observer side effects. + $page = Page::factory()->createQuietly(); + $other = Page::factory()->createQuietly(); PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); @@ -120,7 +122,10 @@ public function test_page_has_many_crawls(): void public function test_page_latest_crawl_returns_row_with_latest_created_at(): void { - $page = Page::factory()->create(); + // createQuietly() skips the PageObserver; this test is about latestOfMany ordering, + // not observer side effects. Using create() would add an observer crawl whose + // created_at is now(), making the test fragile once the hardcoded sentinel date passes. 
+ $page = Page::factory()->createQuietly(); $old = PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); $old->created_at = Carbon::parse('2026-01-01 08:00:00'); From 6f75be7328f138a98e3bfa5686748637dbbc8a7d Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 16:09:28 +0200 Subject: [PATCH 29/65] 8 - Tighten UrlService validation and add observer integration tests --- app/Services/UrlService.php | 9 +++++++-- tests/Feature/UrlDiscoveryTest.php | 24 ++++++++++++++++++++++++ tests/Feature/UrlSubmissionTest.php | 15 +++++++++++++++ tests/Unit/Services/UrlServiceTest.php | 10 ++++++++++ 4 files changed, 56 insertions(+), 2 deletions(-) diff --git a/app/Services/UrlService.php b/app/Services/UrlService.php index 287f8fa..6b1700c 100644 --- a/app/Services/UrlService.php +++ b/app/Services/UrlService.php @@ -21,15 +21,20 @@ public function host(string $url): string throw new InvalidArgumentException("Invalid URL scheme: {$scheme}"); } + if ($uri->user() !== null) { + throw new InvalidArgumentException("URLs with embedded credentials not allowed: {$url}"); + } + $host = $uri->host(); if ($host === null || $host === '') { throw new InvalidArgumentException("URL has no host: {$url}"); } - if (filter_var(trim($host, '[]'), FILTER_VALIDATE_IP) !== false) { + $bareHost = preg_replace('/%.*$/', '', trim($host, '[]')); + if (filter_var($bareHost, FILTER_VALIDATE_IP) !== false) { throw new InvalidArgumentException("IP literal hosts not allowed: {$host}"); } - return strtolower($host); + return mb_strtolower($host); } } diff --git a/tests/Feature/UrlDiscoveryTest.php b/tests/Feature/UrlDiscoveryTest.php index ff36ed2..9d0782f 100644 --- a/tests/Feature/UrlDiscoveryTest.php +++ b/tests/Feature/UrlDiscoveryTest.php @@ -119,6 +119,30 @@ public function test_listener_with_null_post_url_creates_only_target_page(): voi $this->assertSame($instance->id, $targetPage->instance_id); } + // --------------------------------------------------------------------------- 
+ // Integration — UrlDiscovered event enqueues crawls for both pages via observer + // --------------------------------------------------------------------------- + + public function test_url_discovered_event_enqueues_crawls_via_observer(): void + { + $instance = $this->makeInstance(); + + $event = new UrlDiscovered( + url: 'https://example-blog.com/article', + instanceId: $instance->id, + discoveredAt: CarbonImmutable::parse('2026-04-26T12:00:00Z'), + postUrl: 'https://mastodon.social/@alice/109876543210', + postBody: 'check this out https://example-blog.com/article', + ); + + event($event); + + // Listener creates 2 pages (target + source); observer fires for each → 2 crawl rows + $this->assertDatabaseCount('page_crawls', 2); + $this->assertDatabaseHas('page_crawls', ['domain' => 'example-blog.com']); + $this->assertDatabaseHas('page_crawls', ['domain' => 'mastodon.social']); + } + // --------------------------------------------------------------------------- // Test 12 — listener is queued, not run inline // --------------------------------------------------------------------------- diff --git a/tests/Feature/UrlSubmissionTest.php b/tests/Feature/UrlSubmissionTest.php index e2046cd..bc76e44 100644 --- a/tests/Feature/UrlSubmissionTest.php +++ b/tests/Feature/UrlSubmissionTest.php @@ -118,6 +118,21 @@ public static function invalidUrls(): array ]; } + // ------------------------------------------------------------------------- + // Integration — form submission enqueues a crawl via PageObserver + // ------------------------------------------------------------------------- + + public function test_url_submission_form_enqueues_crawl_via_observer(): void + { + Livewire::test(UrlSubmissionForm::class) + ->set('url', 'https://example.com/article') + ->call('submit') + ->assertHasNoErrors(); + + $this->assertDatabaseCount('page_crawls', 1); + $this->assertDatabaseHas('page_crawls', ['domain' => 'example.com']); + } + // 
------------------------------------------------------------------------- // Test 7 — rate limit blocks the 11th submission within a minute // ------------------------------------------------------------------------- diff --git a/tests/Unit/Services/UrlServiceTest.php b/tests/Unit/Services/UrlServiceTest.php index 0632766..97ea9a5 100644 --- a/tests/Unit/Services/UrlServiceTest.php +++ b/tests/Unit/Services/UrlServiceTest.php @@ -96,6 +96,16 @@ public static function invalidInputs(): array 'ipv4 literal' => ['https://192.168.1.1/path'], 'ipv6 literal' => ['https://[::1]/path'], 'ipv4 without path' => ['http://10.0.0.1'], + + // Embedded credentials (userinfo) — phishing/SSRF flag + 'embedded credentials' => ['https://user:pass@example.com/'], + 'username only' => ['https://user@example.com/'], + + // IPv6 with zone identifier — zone suffix defeats FILTER_VALIDATE_IP + 'ipv6 with zone' => ['https://[fe80::1%25eth0]/'], + + // IPv4-mapped IPv6 — FILTER_VALIDATE_IP recognises ::ffff:x.x.x.x as valid IPv6 + 'ipv4 mapped ipv6' => ['https://[::ffff:192.0.2.1]/path'], ]; } } From abbcedf2e7ea2681838468d9d0de33e0cc21b04e Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 16:35:46 +0200 Subject: [PATCH 30/65] 12 - Add Rejected case to CrawlOutcomeEnum and PageStatusEnum --- app/Enums/CrawlOutcomeEnum.php | 1 + app/Enums/PageStatusEnum.php | 1 + tests/Unit/Enums/CrawlOutcomeEnumTest.php | 5 ++-- tests/Unit/Enums/PageStatusEnumTest.php | 33 +++++++++++++++++++++++ 4 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 tests/Unit/Enums/PageStatusEnumTest.php diff --git a/app/Enums/CrawlOutcomeEnum.php b/app/Enums/CrawlOutcomeEnum.php index e7f16dd..c4e912e 100644 --- a/app/Enums/CrawlOutcomeEnum.php +++ b/app/Enums/CrawlOutcomeEnum.php @@ -12,4 +12,5 @@ enum CrawlOutcomeEnum: string case BlockedRobots = 'blocked_robots'; case Blocked4xx = 'blocked_4xx'; case Blocked5xx = 'blocked_5xx'; + case Rejected = 'rejected'; } diff --git 
a/app/Enums/PageStatusEnum.php b/app/Enums/PageStatusEnum.php index 4f73260..ed3abe0 100644 --- a/app/Enums/PageStatusEnum.php +++ b/app/Enums/PageStatusEnum.php @@ -9,4 +9,5 @@ enum PageStatusEnum: string case Discovered = 'discovered'; case Fetched = 'fetched'; case Failed = 'failed'; + case Rejected = 'rejected'; } diff --git a/tests/Unit/Enums/CrawlOutcomeEnumTest.php b/tests/Unit/Enums/CrawlOutcomeEnumTest.php index 3672512..56261cb 100644 --- a/tests/Unit/Enums/CrawlOutcomeEnumTest.php +++ b/tests/Unit/Enums/CrawlOutcomeEnumTest.php @@ -18,6 +18,7 @@ public function test_all_expected_cases_exist_with_correct_backing_values(): voi 'BlockedRobots' => 'blocked_robots', 'Blocked4xx' => 'blocked_4xx', 'Blocked5xx' => 'blocked_5xx', + 'Rejected' => 'rejected', ]; foreach ($expected as $caseName => $backingValue) { @@ -28,8 +29,8 @@ public function test_all_expected_cases_exist_with_correct_backing_values(): voi } } - public function test_enum_has_exactly_six_cases(): void + public function test_enum_has_exactly_seven_cases(): void { - $this->assertCount(6, CrawlOutcomeEnum::cases()); + $this->assertCount(7, CrawlOutcomeEnum::cases()); } } diff --git a/tests/Unit/Enums/PageStatusEnumTest.php b/tests/Unit/Enums/PageStatusEnumTest.php new file mode 100644 index 0000000..63d240d --- /dev/null +++ b/tests/Unit/Enums/PageStatusEnumTest.php @@ -0,0 +1,33 @@ + 'discovered', + 'Fetched' => 'fetched', + 'Failed' => 'failed', + 'Rejected' => 'rejected', + ]; + + foreach ($expected as $caseName => $backingValue) { + $case = PageStatusEnum::from($backingValue); + + $this->assertSame($caseName, $case->name, "Case name for '{$backingValue}' should be '{$caseName}'"); + $this->assertSame($backingValue, $case->value, "Backing value for '{$caseName}' should be '{$backingValue}'"); + } + } + + public function test_enum_has_exactly_four_cases(): void + { + $this->assertCount(4, PageStatusEnum::cases()); + } +} From a9f2d689ae8b8fd3a868c15432ce2a68b987f79d Mon Sep 17 00:00:00 2001 
From: myrmidex Date: Sun, 26 Apr 2026 16:45:07 +0200 Subject: [PATCH 31/65] 12 - Add crawler config and FetchResult value object --- app/ValueObjects/FetchResult.php | 22 ++++++++++++ tests/Unit/ValueObjects/FetchResultTest.php | 37 +++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 app/ValueObjects/FetchResult.php create mode 100644 tests/Unit/ValueObjects/FetchResultTest.php diff --git a/app/ValueObjects/FetchResult.php b/app/ValueObjects/FetchResult.php new file mode 100644 index 0000000..e8339dc --- /dev/null +++ b/app/ValueObjects/FetchResult.php @@ -0,0 +1,22 @@ +assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(200, $result->statusCode); + $this->assertSame('https://example.com/article', $result->finalUrl); + $this->assertSame('An Example Article', $result->title); + $this->assertSame('Lorem ipsum dolor sit amet.', $result->extractedText); + $this->assertInstanceOf(Collection::class, $result->outboundLinks); + $this->assertSame(['https://other.com', 'https://another.com'], $result->outboundLinks->all()); + $this->assertSame(5, $result->wordCount); + $this->assertNull($result->errorMessage); + } +} From bb7906e193f8f0910ebae82f77fc452f900fc9c0 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 16:50:43 +0200 Subject: [PATCH 32/65] 12 - Make FetchResult fields nullable and add missing crawler config --- app/ValueObjects/FetchResult.php | 13 +++--- config/crawler.php | 44 +++++++++++++++++++++ tests/Unit/ValueObjects/FetchResultTest.php | 23 +++++++++++ 3 files changed, 75 insertions(+), 5 deletions(-) create mode 100644 config/crawler.php diff --git a/app/ValueObjects/FetchResult.php b/app/ValueObjects/FetchResult.php index e8339dc..86e04bb 100644 --- a/app/ValueObjects/FetchResult.php +++ b/app/ValueObjects/FetchResult.php @@ -9,14 +9,17 @@ class FetchResult { + /** + * @param Collection $outboundLinks + */ public function __construct( public CrawlOutcomeEnum $outcome, - public int $statusCode, - 
public string $finalUrl, - public string $title, - public string $extractedText, + public ?int $statusCode, + public ?string $finalUrl, + public ?string $title, + public ?string $extractedText, public Collection $outboundLinks, - public int $wordCount, + public ?int $wordCount, public ?string $errorMessage, ) {} } diff --git a/config/crawler.php b/config/crawler.php new file mode 100644 index 0000000..0c9caeb --- /dev/null +++ b/config/crawler.php @@ -0,0 +1,44 @@ + env('CRAWLER_TIMEOUT', 10), + + /* + |--------------------------------------------------------------------------- + | Maximum redirects to follow + |--------------------------------------------------------------------------- + | + | Guzzle default is 5. Cross-origin redirects are accepted in v0.1 — the + | search engine treats the post-redirect URL as the canonical one for + | indexing. + | + */ + + 'max_redirects' => env('CRAWLER_MAX_REDIRECTS', 5), + + /* + |--------------------------------------------------------------------------- + | User-Agent + |--------------------------------------------------------------------------- + | + | Identifies our crawler to target servers. The placeholder below is for + | v0.1 development; ticket #10 replaces it with the production identity + | and adds a `/bot` info page that the URL points at. 
+ | + */ + + 'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'), +]; diff --git a/tests/Unit/ValueObjects/FetchResultTest.php b/tests/Unit/ValueObjects/FetchResultTest.php index 852ea78..c3185f8 100644 --- a/tests/Unit/ValueObjects/FetchResultTest.php +++ b/tests/Unit/ValueObjects/FetchResultTest.php @@ -34,4 +34,27 @@ public function test_it_exposes_all_fields(): void $this->assertSame(5, $result->wordCount); $this->assertNull($result->errorMessage); } + + public function test_it_accepts_null_for_failure_outcome_fields(): void + { + $result = new FetchResult( + outcome: CrawlOutcomeEnum::Failed, + statusCode: null, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'Could not connect', + ); + + $this->assertSame(CrawlOutcomeEnum::Failed, $result->outcome); + $this->assertNull($result->statusCode); + $this->assertNull($result->finalUrl); + $this->assertNull($result->title); + $this->assertNull($result->extractedText); + $this->assertSame([], $result->outboundLinks->all()); + $this->assertNull($result->wordCount); + $this->assertSame('Could not connect', $result->errorMessage); + } } From 1b7fbbfd0c9605a24d325de9b563dda2a6bc556d Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 17:56:13 +0200 Subject: [PATCH 33/65] 12 - Add FetchPageAction with Http::fake-driven outcome paths --- app/Actions/FetchPageAction.php | 98 +++++++++++ tests/Feature/Actions/FetchPageActionTest.php | 156 ++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 app/Actions/FetchPageAction.php create mode 100644 tests/Feature/Actions/FetchPageActionTest.php diff --git a/app/Actions/FetchPageAction.php b/app/Actions/FetchPageAction.php new file mode 100644 index 0000000..9a69018 --- /dev/null +++ b/app/Actions/FetchPageAction.php @@ -0,0 +1,98 @@ +http + ->timeout(config('crawler.timeout')) + ->withHeaders([ + 'User-Agent' => config('crawler.user_agent'), + 
'Accept' => 'text/html', + ]) + ->withOptions([ + 'allow_redirects' => ['max' => config('crawler.max_redirects')], + ]) + ->get($url); + + } catch (ConnectionException|ConnectException $e) { + return $this->failureResult($e); + } + + [$outcome, $error] = $this->validateResponse($response); + + return new FetchResult( + outcome: $outcome, + statusCode: $response->status(), + finalUrl: $url, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: $error ?? null, + ); + } + + private function validateResponse(Response $response): array + { + $status = $response->status(); + $statusStart = substr((string) $status, 0, 1); + + if ($statusStart === '4') { + return [CrawlOutcomeEnum::Blocked4xx, "HTTP {$status}"]; + } + + if (str_starts_with((string) $status, '5')) { + return [CrawlOutcomeEnum::Blocked5xx, "HTTP {$status}"]; + } + + $contentType = $response->header('Content-Type'); + if (! str_starts_with($contentType, 'text/html')) { + return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"]; + } + + return [CrawlOutcomeEnum::Success, null]; + } + + private function failureResult(ConnectionException|ConnectException $e): FetchResult + { + $guzzleException = $e instanceof ConnectException + ? $e + : ($e->getPrevious() instanceof ConnectException + ? $e->getPrevious() + : null); + + $errno = $guzzleException?->getHandlerContext()['errno'] ?? null; + + $outcome = $errno === CURLE_OPERATION_TIMEDOUT + ? 
CrawlOutcomeEnum::Timeout + : CrawlOutcomeEnum::Failed; + + return new FetchResult( + outcome: $outcome, + statusCode: null, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: $e->getMessage(), + ); + } +} diff --git a/tests/Feature/Actions/FetchPageActionTest.php b/tests/Feature/Actions/FetchPageActionTest.php new file mode 100644 index 0000000..1b399a8 --- /dev/null +++ b/tests/Feature/Actions/FetchPageActionTest.php @@ -0,0 +1,156 @@ + Http::response( + 'Hello', + 200, + ['Content-Type' => 'text/html'], + ), + ]); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertInstanceOf(FetchResult::class, $result); + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(200, $result->statusCode); + $this->assertNotNull($result->finalUrl); + } + + public function test_4xx_response_returns_blocked_4xx(): void + { + Http::fake([ + 'example.com/*' => Http::response('Not Found', 404), + ]); + + $result = $this->makeAction()('https://example.com/missing'); + + $this->assertInstanceOf(FetchResult::class, $result); + $this->assertSame(CrawlOutcomeEnum::Blocked4xx, $result->outcome); + $this->assertSame(404, $result->statusCode); + $this->assertIsString($result->errorMessage); + $this->assertStringContainsString('404', $result->errorMessage); + $this->assertNotNull($result->finalUrl); + } + + public function test_5xx_response_returns_blocked_5xx(): void + { + Http::fake([ + 'example.com/*' => Http::response('Service Unavailable', 503), + ]); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertInstanceOf(FetchResult::class, $result); + $this->assertSame(CrawlOutcomeEnum::Blocked5xx, $result->outcome); + $this->assertSame(503, $result->statusCode); + $this->assertIsString($result->errorMessage); + $this->assertStringContainsString('503', $result->errorMessage); + $this->assertNotNull($result->finalUrl); + } + + public function 
test_non_html_content_type_returns_rejected(): void + { + Http::fake([ + 'example.com/*' => Http::response( + 'PDF binary stuff', + 200, + ['Content-Type' => 'application/pdf'], + ), + ]); + + $result = $this->makeAction()('https://example.com/document.pdf'); + + $this->assertInstanceOf(FetchResult::class, $result); + $this->assertSame(CrawlOutcomeEnum::Rejected, $result->outcome); + $this->assertSame(200, $result->statusCode); + $this->assertIsString($result->errorMessage); + $this->assertStringContainsString('application/pdf', $result->errorMessage); + $this->assertNotNull($result->finalUrl); + $this->assertNull($result->title); + $this->assertNull($result->extractedText); + $this->assertEmpty($result->outboundLinks); + $this->assertNull($result->wordCount); + } + + public function test_text_html_with_charset_is_accepted(): void + { + Http::fake([ + 'example.com/*' => Http::response( + 'Hello charset world', + 200, + ['Content-Type' => 'text/html; charset=utf-8'], + ), + ]); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertInstanceOf(FetchResult::class, $result); + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(200, $result->statusCode); + } + + public function test_connection_failure_returns_failed(): void + { + Http::fake(function () { + throw new ConnectException( + 'Could not resolve host', + new Request('GET', 'https://example.com/page'), + null, + ['errno' => 6], + ); + }); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertInstanceOf(FetchResult::class, $result); + $this->assertSame(CrawlOutcomeEnum::Failed, $result->outcome); + $this->assertNull($result->statusCode); + $this->assertNull($result->finalUrl); + $this->assertIsString($result->errorMessage); + $this->assertNull($result->title); + $this->assertNull($result->extractedText); + $this->assertEmpty($result->outboundLinks); + $this->assertNull($result->wordCount); + } + + public function 
test_timeout_returns_timeout(): void + { + Http::fake(function () { + throw new ConnectException( + 'cURL error 28: Operation timed out', + new Request('GET', 'https://example.com/page'), + null, + ['errno' => 28], + ); + }); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertInstanceOf(FetchResult::class, $result); + $this->assertSame(CrawlOutcomeEnum::Timeout, $result->outcome); + $this->assertNull($result->statusCode); + $this->assertNull($result->finalUrl); + $this->assertIsString($result->errorMessage); + } + + private function makeAction(): FetchPageAction + { + return app(FetchPageAction::class); + } +} From 3e2fd0d2c4259f0f24a30817138608742d846f0a Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 18:13:39 +0200 Subject: [PATCH 34/65] chore - Add dev-composer command --- shell.nix | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/shell.nix b/shell.nix index b07745c..c17534f 100644 --- a/shell.nix +++ b/shell.nix @@ -92,6 +92,10 @@ pkgs.mkShell { podman-compose -f $COMPOSE_FILE exec app php artisan "$@" } + dev-composer() { + podman-compose -f $COMPOSE_FILE exec app composer "$@" + } + # =================== # BUILD COMMANDS # =================== @@ -141,6 +145,7 @@ pkgs.mkShell { echo " dev-logs-redis Tail Redis logs" echo " dev-shell Shell into app container" echo " dev-artisan Run artisan command" + echo " dev-composer Run composer command" echo " base-build Build and push image" echo "" echo "Services:" From 35e114782319281e5b93a84490956eda28ac8139 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 19:35:04 +0200 Subject: [PATCH 35/65] 12 - Add HTML content extraction (title, text, links, word count) --- app/Actions/FetchPageAction.php | 68 +++++- composer.json | 4 +- composer.lock | 211 +++++++++++++++++- tests/Feature/Actions/FetchPageActionTest.php | 111 +++++++++ 4 files changed, 386 insertions(+), 8 deletions(-) diff --git a/app/Actions/FetchPageAction.php b/app/Actions/FetchPageAction.php 
index 9a69018..7b98142 100644 --- a/app/Actions/FetchPageAction.php +++ b/app/Actions/FetchPageAction.php @@ -5,11 +5,18 @@ namespace App\Actions; use App\Enums\CrawlOutcomeEnum; +use App\Services\UrlService; use App\ValueObjects\FetchResult; +use fivefilters\Readability\Configuration; +use fivefilters\Readability\Readability; use GuzzleHttp\Exception\ConnectException; use Illuminate\Http\Client\ConnectionException; use Illuminate\Http\Client\Factory; use Illuminate\Http\Client\Response; +use InvalidArgumentException; +use League\Uri\BaseUri; +use Symfony\Component\DomCrawler\Crawler; +use Throwable; class FetchPageAction { @@ -37,14 +44,19 @@ public function __invoke(string $url): FetchResult [$outcome, $error] = $this->validateResponse($response); + if ($outcome === CrawlOutcomeEnum::Success) { + [$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url); + $wordCount = count(preg_split('/\s+/u', trim($extractedText))); + } + return new FetchResult( outcome: $outcome, statusCode: $response->status(), finalUrl: $url, - title: null, - extractedText: null, - outboundLinks: collect(), - wordCount: null, + title: $title ?? null, + extractedText: $extractedText ?? null, + outboundLinks: $links ?? collect(), + wordCount: $wordCount ?? null, errorMessage: $error ?? null, ); } @@ -95,4 +107,52 @@ private function failureResult(ConnectionException|ConnectException $e): FetchRe errorMessage: $e->getMessage(), ); } + + private function extractTitleTextAndLinks(string $body, string $url): array + { + $crawler = new Crawler($body); + + $title = $crawler->filter('title')->count() > 0 + ? trim($crawler->filter('title')->text()) + : null; + + $readability = new Readability(new Configuration); + $readability->parse($body); + $mainContent = $readability->getContent() ?? 
''; + $extractedText = trim(strip_tags($mainContent)); + + $links = collect(); + if ($mainContent !== '') { + $linkCrawler = new Crawler($mainContent); + if ($linkCrawler->filter('a[href]')->count() > 0) { + $links = collect($linkCrawler->filter('a[href]')->extract(['href'])); + } + } + + $linksResolved = $links + ->map(fn (string $href) => $this->resolveAndValidateLink($href, $url)) + ->filter() + ->unique() + ->values(); + + return [$title, $extractedText, $linksResolved]; + } + + private function resolveAndValidateLink(string $href, string $finalUrl): ?string + { + try { + $resolved = (string) BaseUri::from($finalUrl)->resolve($href); + $resolved = strstr($resolved, '#', true) ?: $resolved; + } catch (Throwable) { + return null; + } + + try { + app(UrlService::class)->host($resolved); + } catch (InvalidArgumentException) { + return null; + } + + return $resolved; + } } diff --git a/composer.json b/composer.json index dcb3aca..8494562 100644 --- a/composer.json +++ b/composer.json @@ -16,10 +16,12 @@ ], "require": { "php": "^8.3", + "fivefilters/readability.php": "@dev", "laravel/framework": "^13.0", "laravel/tinker": "^3.0", "livewire/livewire": "^4.2", - "lvl0/fedi-discover": "@dev" + "lvl0/fedi-discover": "@dev", + "symfony/dom-crawler": "^7.4" }, "require-dev": { "fakerphp/faker": "^1.23", diff --git a/composer.lock b/composer.lock index 15b7993..06c83c4 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "e46e58784ec34415557c78db6bb6c97e", + "content-hash": "30d45d9b30092cc20f9364f7c3828aa5", "packages": [ { "name": "brick/math", @@ -508,6 +508,71 @@ ], "time": "2025-03-06T22:45:56+00:00" }, + { + "name": "fivefilters/readability.php", + "version": "v3.3.3", + "source": { + "type": "git", + "url": "https://github.com/fivefilters/readability.php.git", + "reference": 
"e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/fivefilters/readability.php/zipball/e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8", + "reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-mbstring": "*", + "ext-xml": "*", + "league/uri": "^7.0", + "masterminds/html5": "^2.0", + "php": ">=8.1", + "psr/log": "^1.0 || ^2.0 || ^3.0" + }, + "require-dev": { + "monolog/monolog": "^3.0", + "phpunit/phpunit": "^10.0 || ^11.0" + }, + "suggest": { + "monolog/monolog": "Allow logging debug information" + }, + "type": "library", + "autoload": { + "psr-4": { + "fivefilters\\Readability\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "Apache-2.0" + ], + "authors": [ + { + "name": "Andres Rey", + "email": "andreskrey@gmail.com", + "role": "Original Developer" + }, + { + "name": "Keyvan Minoukadeh", + "email": "keyvan@fivefilters.org", + "homepage": "https://www.fivefilters.org", + "role": "Developer/Maintainer" + } + ], + "description": "A PHP port of Readability.js", + "homepage": "https://github.com/fivefilters/readability.php", + "keywords": [ + "html", + "readability" + ], + "support": { + "issues": "https://github.com/fivefilters/readability.php/issues", + "source": "https://github.com/fivefilters/readability.php/tree/v3.3.3" + }, + "time": "2025-04-26T23:45:37+00:00" + }, { "name": "fruitcake/php-cors", "version": "v1.4.0", @@ -2102,7 +2167,7 @@ }, { "name": "lvl0/fedi-discover", - "version": "dev-main", + "version": "dev-release/0.1.0", "dist": { "type": "path", "url": "packages/Lvl0/FediDiscover", @@ -2142,6 +2207,73 @@ "relative": true } }, + { + "name": "masterminds/html5", + "version": "2.10.0", + "source": { + "type": "git", + "url": "https://github.com/Masterminds/html5-php.git", + "reference": "fcf91eb64359852f00d921887b219479b4f21251" + }, + "dist": { + "type": "zip", + "url": 
"https://api.github.com/repos/Masterminds/html5-php/zipball/fcf91eb64359852f00d921887b219479b4f21251", + "reference": "fcf91eb64359852f00d921887b219479b4f21251", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "php": ">=5.3.0" + }, + "require-dev": { + "phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7 || ^8 || ^9" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.7-dev" + } + }, + "autoload": { + "psr-4": { + "Masterminds\\": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Matt Butcher", + "email": "technosophos@gmail.com" + }, + { + "name": "Matt Farina", + "email": "matt@mattfarina.com" + }, + { + "name": "Asmir Mustafic", + "email": "goetas@gmail.com" + } + ], + "description": "An HTML5 parser and serializer.", + "homepage": "http://masterminds.github.io/html5-php", + "keywords": [ + "HTML5", + "dom", + "html", + "parser", + "querypath", + "serializer", + "xml" + ], + "support": { + "issues": "https://github.com/Masterminds/html5-php/issues", + "source": "https://github.com/Masterminds/html5-php/tree/2.10.0" + }, + "time": "2025-07-25T09:04:22+00:00" + }, { "name": "monolog/monolog", "version": "3.10.0", @@ -3729,6 +3861,78 @@ ], "time": "2024-09-25T14:21:43+00:00" }, + { + "name": "symfony/dom-crawler", + "version": "v7.4.8", + "source": { + "type": "git", + "url": "https://github.com/symfony/dom-crawler.git", + "reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/dom-crawler/zipball/2918e7c2ba964defca1f5b69c6f74886529e2dc8", + "reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8", + "shasum": "" + }, + "require": { + "masterminds/html5": "^2.6", + "php": ">=8.2", + "symfony/deprecation-contracts": "^2.5|^3", + "symfony/polyfill-ctype": "~1.8", + "symfony/polyfill-mbstring": "~1.0" + }, + "require-dev": { + "symfony/css-selector": "^6.4|^7.0|^8.0" + }, + 
"type": "library", + "autoload": { + "psr-4": { + "Symfony\\Component\\DomCrawler\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Eases DOM navigation for HTML and XML documents", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/dom-crawler/tree/v7.4.8" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://github.com/nicolas-grekas", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2026-03-24T13:12:05+00:00" + }, { "name": "symfony/error-handler", "version": "v7.4.8", @@ -8445,6 +8649,7 @@ "aliases": [], "minimum-stability": "stable", "stability-flags": { + "fivefilters/readability.php": 20, "lvl0/fedi-discover": 20 }, "prefer-stable": true, @@ -8453,5 +8658,5 @@ "php": "^8.3" }, "platform-dev": {}, - "plugin-api-version": "2.9.0" + "plugin-api-version": "2.6.0" } diff --git a/tests/Feature/Actions/FetchPageActionTest.php b/tests/Feature/Actions/FetchPageActionTest.php index 1b399a8..6925e96 100644 --- a/tests/Feature/Actions/FetchPageActionTest.php +++ b/tests/Feature/Actions/FetchPageActionTest.php @@ -9,6 +9,7 @@ use App\ValueObjects\FetchResult; use GuzzleHttp\Exception\ConnectException; use GuzzleHttp\Psr7\Request; +use Illuminate\Support\Collection; use Illuminate\Support\Facades\Http; use Tests\TestCase; @@ -149,6 +150,116 @@ public function test_timeout_returns_timeout(): void $this->assertIsString($result->errorMessage); } + public function test_success_extracts_title_from_html(): void + { + Http::fake([ + 
'example.com/*' => Http::response( + 'My Page Title

Some content.

', + 200, + ['Content-Type' => 'text/html'], + ), + ]); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame('My Page Title', $result->title); + } + + public function test_success_extracts_main_text(): void + { + $html = <<<'HTML' + + + Article Title + + +
+

The Real Article

+

This is the main article body that should be extracted by readability.

+

Multiple paragraphs prove the extractor works on the full content.

+
+
Site footer noise
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertNotNull($result->extractedText); + $this->assertStringContainsString('main article body', $result->extractedText); + } + + public function test_success_extracts_and_filters_outbound_links(): void + { + $html = <<<'HTML' + + + Article With Links + + + + + + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertInstanceOf(Collection::class, $result->outboundLinks); + $this->assertSame(2, $result->outboundLinks->count()); + $this->assertContains('https://other.com/article', $result->outboundLinks->all()); + $this->assertContains('https://example.com/related-post', $result->outboundLinks->all()); + $this->assertNotContains('http://192.168.1.1/admin', $result->outboundLinks->all()); + $this->assertNotContains('https://user:pass@evil.com/', $result->outboundLinks->all()); + $this->assertNotContains('ftp://files.example.com/', $result->outboundLinks->all()); + } + + public function test_success_calculates_word_count(): void + { + $html = <<<'HTML' + + + Word Count Test + +
+

This article body has exactly nine words total here.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(9, $result->wordCount); + } + private function makeAction(): FetchPageAction { return app(FetchPageAction::class); From dda5b0f770c7601a7587b225ae6f8d5316b3b41f Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 19:49:08 +0200 Subject: [PATCH 36/65] 12 - Apply pr-reviewer follow-ups: validation, link filters, readonly VO, docs --- app/Actions/FetchPageAction.php | 16 ++-- app/Enums/CrawlOutcomeEnum.php | 7 ++ app/Enums/PageStatusEnum.php | 7 ++ app/ValueObjects/FetchResult.php | 3 +- composer.json | 2 +- composer.lock | 73 +++++++++---------- tests/Feature/Actions/FetchPageActionTest.php | 63 ++++++++++++++++ 7 files changed, 126 insertions(+), 45 deletions(-) diff --git a/app/Actions/FetchPageAction.php b/app/Actions/FetchPageAction.php index 7b98142..ec92a8d 100644 --- a/app/Actions/FetchPageAction.php +++ b/app/Actions/FetchPageAction.php @@ -22,6 +22,7 @@ class FetchPageAction { public function __construct( private Factory $http, + private UrlService $urlService, ) {} public function __invoke(string $url): FetchResult @@ -46,7 +47,7 @@ public function __invoke(string $url): FetchResult if ($outcome === CrawlOutcomeEnum::Success) { [$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url); - $wordCount = count(preg_split('/\s+/u', trim($extractedText))); + $wordCount = $extractedText !== '' ? 
count(preg_split('/\s+/u', trim($extractedText))) : 0; } return new FetchResult( @@ -64,18 +65,17 @@ public function __invoke(string $url): FetchResult private function validateResponse(Response $response): array { $status = $response->status(); - $statusStart = substr((string) $status, 0, 1); - if ($statusStart === '4') { + if ($status >= 400 && $status < 500) { return [CrawlOutcomeEnum::Blocked4xx, "HTTP {$status}"]; } - if (str_starts_with((string) $status, '5')) { + if ($status >= 500) { return [CrawlOutcomeEnum::Blocked5xx, "HTTP {$status}"]; } $contentType = $response->header('Content-Type'); - if (! str_starts_with($contentType, 'text/html')) { + if (! str_starts_with(mb_strtolower($contentType), 'text/html')) { return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"]; } @@ -147,8 +147,12 @@ private function resolveAndValidateLink(string $href, string $finalUrl): ?string return null; } + if ($resolved === $finalUrl) { + return null; + } + try { - app(UrlService::class)->host($resolved); + $this->urlService->host($resolved); } catch (InvalidArgumentException) { return null; } diff --git a/app/Enums/CrawlOutcomeEnum.php b/app/Enums/CrawlOutcomeEnum.php index c4e912e..949cf69 100644 --- a/app/Enums/CrawlOutcomeEnum.php +++ b/app/Enums/CrawlOutcomeEnum.php @@ -12,5 +12,12 @@ enum CrawlOutcomeEnum: string case BlockedRobots = 'blocked_robots'; case Blocked4xx = 'blocked_4xx'; case Blocked5xx = 'blocked_5xx'; + + /** + * The HTTP fetch succeeded (2xx) but the response is unindexable in v0.1 + * (non-HTML Content-Type). Worker MUST also write `pages.status = Rejected` + * on this outcome — do NOT treat as Failed. Page row STAYS in the DB to + * prevent re-discovery loops as fediverse re-shares the URL. 
+ */ case Rejected = 'rejected'; } diff --git a/app/Enums/PageStatusEnum.php b/app/Enums/PageStatusEnum.php index ed3abe0..84bee4c 100644 --- a/app/Enums/PageStatusEnum.php +++ b/app/Enums/PageStatusEnum.php @@ -9,5 +9,12 @@ enum PageStatusEnum: string case Discovered = 'discovered'; case Fetched = 'fetched'; case Failed = 'failed'; + + /** + * The crawler fetched the page but rejected it as unindexable in v0.1 + * (non-HTML Content-Type). Page row stays as a sentinel preventing + * re-discovery loops; future re-crawl could flip status back to + * Discovered → Fetched if the URL starts serving HTML. + */ case Rejected = 'rejected'; } diff --git a/app/ValueObjects/FetchResult.php b/app/ValueObjects/FetchResult.php index 86e04bb..d79cdae 100644 --- a/app/ValueObjects/FetchResult.php +++ b/app/ValueObjects/FetchResult.php @@ -7,9 +7,10 @@ use App\Enums\CrawlOutcomeEnum; use Illuminate\Support\Collection; -class FetchResult +final readonly class FetchResult { /** + * @param ?string $finalUrl Set to the request URL in v0.1; true post-redirect URL tracking is deferred (see ticket #12 spec). Downstream consumers MUST NOT trust this field as the post-redirect location until that lands. 
* @param Collection $outboundLinks */ public function __construct( diff --git a/composer.json b/composer.json index 8494562..de1ad17 100644 --- a/composer.json +++ b/composer.json @@ -16,7 +16,7 @@ ], "require": { "php": "^8.3", - "fivefilters/readability.php": "@dev", + "fivefilters/readability.php": "^3.3", "laravel/framework": "^13.0", "laravel/tinker": "^3.0", "livewire/livewire": "^4.2", diff --git a/composer.lock b/composer.lock index 06c83c4..e1fe116 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "30d45d9b30092cc20f9364f7c3828aa5", + "content-hash": "2c63ed546b17b144997244f805e8a94a", "packages": [ { "name": "brick/math", @@ -4620,7 +4620,7 @@ }, { "name": "symfony/polyfill-ctype", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-ctype.git", @@ -4679,7 +4679,7 @@ "portable" ], "support": { - "source": "https://github.com/symfony/polyfill-ctype/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-ctype/tree/v1.37.0" }, "funding": [ { @@ -4703,16 +4703,16 @@ }, { "name": "symfony/polyfill-intl-grapheme", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-grapheme.git", - "reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df" + "reference": "4864388bfbd3001ce88e234fab652acd91fdc57e" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/ad1b7b9092976d6c948b8a187cec9faaea9ec1df", - "reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df", + "url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/4864388bfbd3001ce88e234fab652acd91fdc57e", + "reference": "4864388bfbd3001ce88e234fab652acd91fdc57e", "shasum": "" }, "require": { @@ -4761,7 +4761,7 @@ "shim" ], "support": { - 
"source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.37.0" }, "funding": [ { @@ -4781,11 +4781,11 @@ "type": "tidelift" } ], - "time": "2026-04-10T16:19:22+00:00" + "time": "2026-04-26T13:13:48+00:00" }, { "name": "symfony/polyfill-intl-idn", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-idn.git", @@ -4848,7 +4848,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.37.0" }, "funding": [ { @@ -4872,7 +4872,7 @@ }, { "name": "symfony/polyfill-intl-normalizer", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-normalizer.git", @@ -4933,7 +4933,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.37.0" }, "funding": [ { @@ -4957,7 +4957,7 @@ }, { "name": "symfony/polyfill-mbstring", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-mbstring.git", @@ -5018,7 +5018,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-mbstring/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-mbstring/tree/v1.37.0" }, "funding": [ { @@ -5042,7 +5042,7 @@ }, { "name": "symfony/polyfill-php80", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php80.git", @@ -5102,7 +5102,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php80/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php80/tree/v1.37.0" }, "funding": [ { @@ -5126,7 +5126,7 @@ }, { "name": "symfony/polyfill-php83", - "version": "v1.36.0", + 
"version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php83.git", @@ -5182,7 +5182,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php83/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php83/tree/v1.37.0" }, "funding": [ { @@ -5206,7 +5206,7 @@ }, { "name": "symfony/polyfill-php84", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php84.git", @@ -5262,7 +5262,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php84/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php84/tree/v1.37.0" }, "funding": [ { @@ -5286,16 +5286,16 @@ }, { "name": "symfony/polyfill-php85", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php85.git", - "reference": "2c408a6bb0313e6001a83628dc5506100474254e" + "reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/2c408a6bb0313e6001a83628dc5506100474254e", - "reference": "2c408a6bb0313e6001a83628dc5506100474254e", + "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/fcfa4973a9917cef23f2e38774da74a2b7d115ee", + "reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee", "shasum": "" }, "require": { @@ -5342,7 +5342,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php85/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php85/tree/v1.37.0" }, "funding": [ { @@ -5362,11 +5362,11 @@ "type": "tidelift" } ], - "time": "2026-04-10T16:50:15+00:00" + "time": "2026-04-26T13:10:57+00:00" }, { "name": "symfony/polyfill-uuid", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-uuid.git", @@ -5425,7 +5425,7 @@ "uuid" ], "support": { - "source": 
"https://github.com/symfony/polyfill-uuid/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-uuid/tree/v1.37.0" }, "funding": [ { @@ -6263,16 +6263,16 @@ }, { "name": "voku/portable-ascii", - "version": "2.1.0", + "version": "2.1.1", "source": { "type": "git", "url": "https://github.com/voku/portable-ascii.git", - "reference": "d870a33f0f79d2b4579740b0620200221ee44aeb" + "reference": "8e1051fe39379367aecf014f41744ce7539a856f" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/voku/portable-ascii/zipball/d870a33f0f79d2b4579740b0620200221ee44aeb", - "reference": "d870a33f0f79d2b4579740b0620200221ee44aeb", + "url": "https://api.github.com/repos/voku/portable-ascii/zipball/8e1051fe39379367aecf014f41744ce7539a856f", + "reference": "8e1051fe39379367aecf014f41744ce7539a856f", "shasum": "" }, "require": { @@ -6309,7 +6309,7 @@ ], "support": { "issues": "https://github.com/voku/portable-ascii/issues", - "source": "https://github.com/voku/portable-ascii/tree/2.1.0" + "source": "https://github.com/voku/portable-ascii/tree/2.1.1" }, "funding": [ { @@ -6333,7 +6333,7 @@ "type": "tidelift" } ], - "time": "2026-04-16T23:10:39+00:00" + "time": "2026-04-26T05:33:54+00:00" } ], "packages-dev": [ @@ -8649,7 +8649,6 @@ "aliases": [], "minimum-stability": "stable", "stability-flags": { - "fivefilters/readability.php": 20, "lvl0/fedi-discover": 20 }, "prefer-stable": true, @@ -8658,5 +8657,5 @@ "php": "^8.3" }, "platform-dev": {}, - "plugin-api-version": "2.6.0" + "plugin-api-version": "2.9.0" } diff --git a/tests/Feature/Actions/FetchPageActionTest.php b/tests/Feature/Actions/FetchPageActionTest.php index 6925e96..b5f415a 100644 --- a/tests/Feature/Actions/FetchPageActionTest.php +++ b/tests/Feature/Actions/FetchPageActionTest.php @@ -260,6 +260,69 @@ public function test_success_calculates_word_count(): void $this->assertSame(9, $result->wordCount); } + public function test_uppercase_content_type_is_accepted_as_html(): void + { + Http::fake([ + 
'example.com/*' => Http::response( + 'Uppercase CT

Content here.

', + 200, + ['Content-Type' => 'Text/HTML; charset=utf-8'], + ), + ]); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + } + + public function test_empty_href_is_filtered_from_outbound_links(): void + { + $html = <<<'HTML' + + + Empty Href Test + + + + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(0, $result->outboundLinks->count()); + } + + public function test_fragment_only_href_is_filtered_from_outbound_links(): void + { + $html = <<<'HTML' + + + Fragment Href Test + + + + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(0, $result->outboundLinks->count()); + } + private function makeAction(): FetchPageAction { return app(FetchPageAction::class); From 649aeb362790f592c6d68244a985809b3026d8ef Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 20:18:18 +0200 Subject: [PATCH 37/65] chore - Extract RegisterDiscoveredPageAction for shared Page::firstOrCreate logic --- app/Actions/RegisterDiscoveredPageAction.php | 22 +++++ app/Listeners/UrlDiscoveredListener.php | 17 ++-- app/Livewire/UrlSubmissionForm.php | 10 +-- .../RegisterDiscoveredPageActionTest.php | 83 +++++++++++++++++++ 4 files changed, 115 insertions(+), 17 deletions(-) create mode 100644 app/Actions/RegisterDiscoveredPageAction.php create mode 100644 tests/Unit/Actions/RegisterDiscoveredPageActionTest.php diff --git a/app/Actions/RegisterDiscoveredPageAction.php b/app/Actions/RegisterDiscoveredPageAction.php new file mode 100644 index 0000000..840e52c --- /dev/null +++ 
b/app/Actions/RegisterDiscoveredPageAction.php @@ -0,0 +1,22 @@ + $url], + [ + 'status' => PageStatusEnum::Discovered, + 'instance_id' => $instanceId, + ], + ); + } +} diff --git a/app/Listeners/UrlDiscoveredListener.php b/app/Listeners/UrlDiscoveredListener.php index 535951a..67b4f1f 100644 --- a/app/Listeners/UrlDiscoveredListener.php +++ b/app/Listeners/UrlDiscoveredListener.php @@ -4,8 +4,7 @@ namespace App\Listeners; -use App\Enums\PageStatusEnum; -use App\Models\Page; +use App\Actions\RegisterDiscoveredPageAction; use App\Models\PageLink; use Illuminate\Contracts\Queue\ShouldQueue; use Illuminate\Support\Facades\DB; @@ -13,22 +12,20 @@ class UrlDiscoveredListener implements ShouldQueue { + public function __construct( + private RegisterDiscoveredPageAction $registerPage, + ) {} + public function handle(UrlDiscovered $event): void { DB::transaction(function () use ($event) { - $targetPage = Page::firstOrCreate( - ['url' => $event->url], - ['status' => PageStatusEnum::Discovered, 'instance_id' => $event->instanceId], - ); + $targetPage = ($this->registerPage)($event->url, $event->instanceId); if ($event->postUrl === null || $event->postUrl === $event->url) { return; } - $sourcePage = Page::firstOrCreate( - ['url' => $event->postUrl], - ['status' => PageStatusEnum::Discovered, 'instance_id' => $event->instanceId], - ); + $sourcePage = ($this->registerPage)($event->postUrl, $event->instanceId); PageLink::firstOrCreate([ 'source_page_id' => $sourcePage->id, diff --git a/app/Livewire/UrlSubmissionForm.php b/app/Livewire/UrlSubmissionForm.php index bbf2000..8c1b11e 100644 --- a/app/Livewire/UrlSubmissionForm.php +++ b/app/Livewire/UrlSubmissionForm.php @@ -4,8 +4,7 @@ namespace App\Livewire; -use App\Enums\PageStatusEnum; -use App\Models\Page; +use App\Actions\RegisterDiscoveredPageAction; use Illuminate\Contracts\View\View; use Illuminate\Support\Facades\RateLimiter; use Livewire\Component; @@ -16,7 +15,7 @@ class UrlSubmissionForm extends Component public ?string 
$confirmedUrl = null; - public function submit(): void + public function submit(RegisterDiscoveredPageAction $registerPage): void { $key = 'submit-url:' . request()->ip(); @@ -32,10 +31,7 @@ public function submit(): void 'url' => ['required', 'url:http,https'], ]); - Page::firstOrCreate( - ['url' => $validated['url']], - ['status' => PageStatusEnum::Discovered], - ); + $registerPage($validated['url']); $this->confirmedUrl = $validated['url']; $this->reset('url'); diff --git a/tests/Unit/Actions/RegisterDiscoveredPageActionTest.php b/tests/Unit/Actions/RegisterDiscoveredPageActionTest.php new file mode 100644 index 0000000..f993940 --- /dev/null +++ b/tests/Unit/Actions/RegisterDiscoveredPageActionTest.php @@ -0,0 +1,83 @@ +assertInstanceOf(Page::class, $page); + $this->assertSame('https://example.com/article', $page->url); + $this->assertSame(PageStatusEnum::Discovered, $page->status); + $this->assertNull($page->instance_id); + $this->assertDatabaseHas('pages', ['url' => 'https://example.com/article']); + } + + public function test_creates_page_with_provided_instance_id(): void + { + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(); + + $action = new RegisterDiscoveredPageAction; + + $page = $action('https://example.com/fediverse-post', instanceId: $instance->id); + + $this->assertInstanceOf(Page::class, $page); + $this->assertSame($instance->id, $page->instance_id); + $this->assertDatabaseHas('pages', [ + 'url' => 'https://example.com/fediverse-post', + 'instance_id' => $instance->id, + ]); + } + + public function test_returns_existing_page_when_url_already_exists(): void + { + $existing = Page::factory()->createQuietly([ + 'url' => 'https://example.com/seen-before', + 'status' => PageStatusEnum::Discovered, + ]); + + $action = new RegisterDiscoveredPageAction; + + $returned = $action('https://example.com/seen-before'); + + $this->assertSame($existing->id, $returned->id); + $this->assertDatabaseCount('pages', 1); + } + + 
public function test_existing_page_status_not_overwritten_on_duplicate_call(): void + { + Page::factory()->createQuietly([ + 'url' => 'https://example.com/already-fetched', + 'status' => PageStatusEnum::Fetched, + ]); + + $action = new RegisterDiscoveredPageAction; + + $returned = $action('https://example.com/already-fetched'); + + $this->assertSame(PageStatusEnum::Fetched, $returned->status); + $this->assertDatabaseHas('pages', [ + 'url' => 'https://example.com/already-fetched', + 'status' => PageStatusEnum::Fetched, + ]); + } +} From 6c0e1fe12d143de2f6cb14b401247ff38a1b45fd Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 20:22:24 +0200 Subject: [PATCH 38/65] chore - Simplify call-site tests now that RegisterDiscoveredPageAction is unit-tested --- tests/Feature/UrlDiscoveryTest.php | 8 -------- tests/Feature/UrlSubmissionTest.php | 2 -- 2 files changed, 10 deletions(-) diff --git a/tests/Feature/UrlDiscoveryTest.php b/tests/Feature/UrlDiscoveryTest.php index 9d0782f..c616132 100644 --- a/tests/Feature/UrlDiscoveryTest.php +++ b/tests/Feature/UrlDiscoveryTest.php @@ -4,7 +4,6 @@ namespace Tests\Feature; -use App\Enums\PageStatusEnum; use App\Listeners\UrlDiscoveredListener; use App\Models\Page; use App\Models\PageLink; @@ -66,15 +65,10 @@ public function test_listener_creates_target_page_and_source_page_with_link(): v // Target page $targetPage = Page::where('url', 'https://example-blog.com/article')->first(); $this->assertNotNull($targetPage); - $this->assertSame(PageStatusEnum::Discovered, $targetPage->status); - $this->assertSame($instance->id, $targetPage->instance_id); // Source page $sourcePage = Page::where('url', 'https://mastodon.social/@alice/109876543210')->first(); $this->assertNotNull($sourcePage); - $this->assertSame(PageStatusEnum::Discovered, $sourcePage->status); - $this->assertSame($instance->id, $sourcePage->instance_id); - $this->assertNull($sourcePage->fetched_at); // Edge $link = PageLink::where('source_page_id', 
$sourcePage->id) @@ -115,8 +109,6 @@ public function test_listener_with_null_post_url_creates_only_target_page(): voi $targetPage = Page::where('url', 'https://example-blog.com/article')->first(); $this->assertNotNull($targetPage); - $this->assertSame(PageStatusEnum::Discovered, $targetPage->status); - $this->assertSame($instance->id, $targetPage->instance_id); } // --------------------------------------------------------------------------- diff --git a/tests/Feature/UrlSubmissionTest.php b/tests/Feature/UrlSubmissionTest.php index bc76e44..ff85245 100644 --- a/tests/Feature/UrlSubmissionTest.php +++ b/tests/Feature/UrlSubmissionTest.php @@ -41,8 +41,6 @@ public function test_valid_url_submission_creates_page_as_discovered(): void $this->assertDatabaseHas('pages', [ 'url' => 'https://example.com/interesting-post', - 'status' => PageStatusEnum::Discovered, - 'instance_id' => null, ]); } From 118de0023ae95e26d02efd74ef03aa6ffd91e996 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 20:58:07 +0200 Subject: [PATCH 39/65] 14 - Simplify page_crawls schema (queue moves to Redis) --- app/Models/PageCrawl.php | 4 ---- database/factories/PageCrawlFactory.php | 18 ------------------ ..._04_26_111140_create_page_crawls_table.php | 13 ------------- tests/Feature/PageQueuePopulationTest.php | 1 - tests/Unit/Models/PageCrawlFactoryTest.php | 19 ------------------- tests/Unit/Models/PageCrawlTest.php | 9 --------- 6 files changed, 64 deletions(-) diff --git a/app/Models/PageCrawl.php b/app/Models/PageCrawl.php index 8568bee..aa6f77a 100644 --- a/app/Models/PageCrawl.php +++ b/app/Models/PageCrawl.php @@ -19,20 +19,16 @@ class PageCrawl extends Model 'page_id', 'domain', 'priority', - 'scheduled_for', 'completed_at', 'outcome', 'status_code', 'error_message', - 'locked_at', ]; protected $casts = [ 'priority' => 'integer', - 'scheduled_for' => 'datetime', 'completed_at' => 'datetime', 'outcome' => CrawlOutcomeEnum::class, - 'locked_at' => 'datetime', 'status_code' => 
'integer', ]; diff --git a/database/factories/PageCrawlFactory.php b/database/factories/PageCrawlFactory.php index 80c6f7c..cdd6289 100644 --- a/database/factories/PageCrawlFactory.php +++ b/database/factories/PageCrawlFactory.php @@ -7,7 +7,6 @@ use App\Enums\CrawlOutcomeEnum; use App\Models\Page; use App\Models\PageCrawl; -use Carbon\Carbon; use Illuminate\Database\Eloquent\Factories\Factory; /** @@ -21,12 +20,10 @@ public function definition(): array 'page_id' => null, 'domain' => 'example.com', 'priority' => 0, - 'scheduled_for' => now(), 'completed_at' => null, 'outcome' => null, 'status_code' => null, 'error_message' => null, - 'locked_at' => null, ]; } @@ -53,19 +50,4 @@ public function failed(string $errorMessage): static 'error_message' => $errorMessage, ]); } - - public function scheduledAt(Carbon $scheduledAt): static - { - return $this->state(fn () => [ - 'scheduled_for' => $scheduledAt, - ]); - } - - public function locked(): static - { - return $this->state(fn () => [ - 'locked_at' => now(), - 'outcome' => null, - ]); - } } diff --git a/database/migrations/2026_04_26_111140_create_page_crawls_table.php b/database/migrations/2026_04_26_111140_create_page_crawls_table.php index b423f25..9e18d9a 100644 --- a/database/migrations/2026_04_26_111140_create_page_crawls_table.php +++ b/database/migrations/2026_04_26_111140_create_page_crawls_table.php @@ -4,7 +4,6 @@ use Illuminate\Database\Migrations\Migration; use Illuminate\Database\Schema\Blueprint; -use Illuminate\Support\Facades\DB; use Illuminate\Support\Facades\Schema; return new class extends Migration @@ -18,8 +17,6 @@ public function up(): void ->cascadeOnDelete(); $table->string('domain'); $table->smallInteger('priority')->default(0); - $table->timestampTz('scheduled_for')->useCurrent(); - $table->timestampTz('locked_at')->nullable(); $table->timestampTz('completed_at')->nullable(); $table->string('outcome')->nullable(); $table->smallInteger('status_code')->nullable(); @@ -28,16 +25,6 @@ public 
function up(): void $table->index(['page_id', 'created_at']); }); - - if (DB::getDriverName() === 'pgsql') { - DB::statement('CREATE INDEX page_crawls_pending_domain_idx ON page_crawls (domain) WHERE outcome IS NULL'); - DB::statement('CREATE INDEX page_crawls_pending_poll_idx ON page_crawls (scheduled_for, locked_at) WHERE outcome IS NULL'); - } else { - Schema::table('page_crawls', function (Blueprint $table) { - $table->index('domain'); - $table->index(['scheduled_for', 'locked_at']); - }); - } } public function down(): void diff --git a/tests/Feature/PageQueuePopulationTest.php b/tests/Feature/PageQueuePopulationTest.php index 4204799..6addcd0 100644 --- a/tests/Feature/PageQueuePopulationTest.php +++ b/tests/Feature/PageQueuePopulationTest.php @@ -30,7 +30,6 @@ public function test_creating_a_page_inserts_a_page_crawl_row(): void $crawl = PageCrawl::where('page_id', $page->id)->first(); $this->assertNotNull($crawl); - $this->assertNotNull($crawl->scheduled_for); } public function test_created_page_crawl_has_null_outcome(): void diff --git a/tests/Unit/Models/PageCrawlFactoryTest.php b/tests/Unit/Models/PageCrawlFactoryTest.php index 65d29cc..21990fa 100644 --- a/tests/Unit/Models/PageCrawlFactoryTest.php +++ b/tests/Unit/Models/PageCrawlFactoryTest.php @@ -34,23 +34,4 @@ public function test_factory_failed_state_produces_failed_outcome_with_message() $this->assertInstanceOf(Carbon::class, $crawl->completed_at); $this->assertSame('Connection timed out', $crawl->error_message); } - - public function test_factory_locked_state_produces_in_flight_crawl(): void - { - $page = Page::factory()->create(); - $crawl = PageCrawl::factory()->page($page)->locked()->create(); - - $this->assertInstanceOf(Carbon::class, $crawl->locked_at); - $this->assertNull($crawl->completed_at); - $this->assertNull($crawl->outcome); - } - - public function test_factory_scheduled_at_state_overrides_default_scheduled_for(): void - { - $page = Page::factory()->create(); - $timestamp = 
Carbon::parse('2026-05-01 10:00:00'); - $crawl = PageCrawl::factory()->page($page)->scheduledAt($timestamp)->create(); - - $this->assertTrue($timestamp->equalTo($crawl->scheduled_for)); - } } diff --git a/tests/Unit/Models/PageCrawlTest.php b/tests/Unit/Models/PageCrawlTest.php index 4c19294..73fdad0 100644 --- a/tests/Unit/Models/PageCrawlTest.php +++ b/tests/Unit/Models/PageCrawlTest.php @@ -19,16 +19,12 @@ public function test_page_crawl_fillable_fields_persist_and_casts_are_applied(): { $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-1']); - $scheduledFor = Carbon::parse('2026-05-01 10:00:00'); - $lockedAt = Carbon::parse('2026-05-01 10:01:00'); $completedAt = Carbon::parse('2026-05-01 10:01:05'); $crawl = PageCrawl::create([ 'page_id' => $page->id, 'domain' => 'example.com', 'priority' => 5, - 'scheduled_for' => $scheduledFor, - 'locked_at' => $lockedAt, 'completed_at' => $completedAt, 'outcome' => CrawlOutcomeEnum::Success, 'status_code' => 200, @@ -48,12 +44,8 @@ public function test_page_crawl_fillable_fields_persist_and_casts_are_applied(): $this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome); // datetime casts - $this->assertInstanceOf(Carbon::class, $fresh->scheduled_for); - $this->assertInstanceOf(Carbon::class, $fresh->locked_at); $this->assertInstanceOf(Carbon::class, $fresh->completed_at); - $this->assertTrue($scheduledFor->equalTo($fresh->scheduled_for)); - $this->assertTrue($lockedAt->equalTo($fresh->locked_at)); $this->assertTrue($completedAt->equalTo($fresh->completed_at)); // nullable columns @@ -71,7 +63,6 @@ public function test_page_crawl_belongs_to_a_page(): void 'page_id' => $page->id, 'domain' => 'example.com', 'priority' => 1, - 'scheduled_for' => Carbon::now(), ]); $related = $crawl->page; From 2a586ecac4dfcb7cdc0e15a7c4b6a25a2c51f66d Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 21:15:07 +0200 Subject: [PATCH 40/65] 14 - Add PageCrawlObserver and ProcessCrawlJob skeleton --- 
app/Jobs/ProcessCrawlJob.php | 21 ++++++++++++ app/Models/PageCrawl.php | 3 ++ app/Observers/PageCrawlObserver.php | 14 ++++++++ tests/Feature/Jobs/ProcessCrawlJobTest.php | 40 ++++++++++++++++++++++ 4 files changed, 78 insertions(+) create mode 100644 app/Jobs/ProcessCrawlJob.php create mode 100644 app/Observers/PageCrawlObserver.php create mode 100644 tests/Feature/Jobs/ProcessCrawlJobTest.php diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php new file mode 100644 index 0000000..5c55b67 --- /dev/null +++ b/app/Jobs/ProcessCrawlJob.php @@ -0,0 +1,21 @@ + */ diff --git a/app/Observers/PageCrawlObserver.php b/app/Observers/PageCrawlObserver.php new file mode 100644 index 0000000..85a8517 --- /dev/null +++ b/app/Observers/PageCrawlObserver.php @@ -0,0 +1,14 @@ +createQuietly(['url' => 'https://example.com/article']); + PageCrawl::factory()->page($page)->create(); + + Queue::assertPushed(ProcessCrawlJob::class); + } + + public function test_dispatched_job_carries_the_correct_page_crawl(): void + { + Queue::fake(); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->create(); + + Queue::assertPushed( + ProcessCrawlJob::class, + fn (ProcessCrawlJob $job) => $job->pageCrawl->id === $crawl->id, + ); + } +} From 720e4bcc1feb1b0083e07bd75b54eceddf85ed59 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 23:50:57 +0200 Subject: [PATCH 41/65] 14 - Implement ProcessCrawlJob orchestration with retry logic --- app/Jobs/ProcessCrawlJob.php | 67 +++++- app/Models/PageCrawl.php | 3 + tests/Feature/Jobs/ProcessCrawlJobTest.php | 259 +++++++++++++++++++++ tests/Feature/PageQueuePopulationTest.php | 10 - tests/Unit/Models/PageCrawlFactoryTest.php | 5 + tests/Unit/Models/PageCrawlTest.php | 5 + tests/Unit/Models/PageTest.php | 7 + 7 files changed, 344 insertions(+), 12 deletions(-) diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php index 
5c55b67..a8b4513 100644 --- a/app/Jobs/ProcessCrawlJob.php +++ b/app/Jobs/ProcessCrawlJob.php @@ -2,7 +2,12 @@ namespace App\Jobs; +use App\Actions\FetchPageAction; +use App\Actions\RegisterDiscoveredPageAction; +use App\Enums\CrawlOutcomeEnum; +use App\Enums\PageStatusEnum; use App\Models\PageCrawl; +use App\ValueObjects\FetchResult; use Illuminate\Contracts\Queue\ShouldQueue; use Illuminate\Foundation\Queue\Queueable; @@ -14,8 +19,66 @@ public function __construct( public PageCrawl $pageCrawl, ) {} - public function handle(): void + public function handle( + FetchPageAction $fetcher, + RegisterDiscoveredPageAction $register, + ): void { + /** @var FetchResult $result */ + $result = $fetcher($this->pageCrawl->page->url); + + $this->pageCrawl->update([ + 'outcome' => CrawlOutcomeEnum::Success, + 'completed_at' => now(), + 'status_code' => 200, + ]); + + $update = match ($result->outcome) { + CrawlOutcomeEnum::Rejected => [ + 'status' => PageStatusEnum::Rejected, + 'fetched_at' => null, + ], + CrawlOutcomeEnum::Timeout => [ + 'status' => PageStatusEnum::Failed, + 'failed_at' => now(), + ], + CrawlOutcomeEnum::Blocked4xx => [ + 'status' => PageStatusEnum::Failed, + 'failed_at' => now(), + ], + default => [ + 'status' => PageStatusEnum::Fetched, + 'fetched_at' => now(), + 'title' => $result->title, + ], + }; + + $result->outboundLinks->each(fn (string $url) => $register($url)); + + $this->pageCrawl->page->update($update); + + if (in_array($result->outcome, [ + CrawlOutcomeEnum::Failed, + CrawlOutcomeEnum::Timeout, + CrawlOutcomeEnum::Blocked5xx, + ])) { + $this->scheduleRetryIfNeeded($result, $this->pageCrawl); + } + } + + private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): void { - // + if (PageCrawl::where('page_id', $crawl->page_id)->count() >= 3) { + return; + } + + $newRow = PageCrawl::withoutEvents( + fn () => PageCrawl::create( + array_merge($crawl->toArray(), [ + 'outcome' => null, + ]) + ) + ); + + 
ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour()); } } diff --git a/app/Models/PageCrawl.php b/app/Models/PageCrawl.php index 27cfe40..ba2ba29 100644 --- a/app/Models/PageCrawl.php +++ b/app/Models/PageCrawl.php @@ -35,6 +35,9 @@ class PageCrawl extends Model 'status_code' => 'integer', ]; + /** + * @return BelongsTo + */ public function page(): BelongsTo { return $this->belongsTo(Page::class); diff --git a/tests/Feature/Jobs/ProcessCrawlJobTest.php b/tests/Feature/Jobs/ProcessCrawlJobTest.php index 98c632b..8503089 100644 --- a/tests/Feature/Jobs/ProcessCrawlJobTest.php +++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php @@ -4,11 +4,18 @@ namespace Tests\Feature\Jobs; +use App\Actions\FetchPageAction; +use App\Actions\RegisterDiscoveredPageAction; +use App\Enums\CrawlOutcomeEnum; +use App\Enums\PageStatusEnum; use App\Jobs\ProcessCrawlJob; use App\Models\Page; use App\Models\PageCrawl; +use App\ValueObjects\FetchResult; +use Carbon\Carbon; use Illuminate\Foundation\Testing\RefreshDatabase; use Illuminate\Support\Facades\Queue; +use Mockery; use Tests\TestCase; class ProcessCrawlJobTest extends TestCase @@ -37,4 +44,256 @@ public function test_dispatched_job_carries_the_correct_page_crawl(): void fn (ProcessCrawlJob $job) => $job->pageCrawl->id === $crawl->id, ); } + + public function test_handle_writes_outcome_to_page_crawl_on_success(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Success, + statusCode: 200, + finalUrl: 'https://example.com/article', + title: 'Hello', + extractedText: 'hi', + outboundLinks: collect(), + wordCount: 1, + errorMessage: null, + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + 
->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $fresh = $crawl->fresh(); + $this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome); + $this->assertNotNull($fresh->completed_at); + $this->assertInstanceOf(Carbon::class, $fresh->completed_at); + $this->assertSame(200, $fresh->status_code); + $this->assertNull($fresh->error_message); + } + + public function test_handle_updates_page_to_fetched_on_success(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Success, + statusCode: 200, + finalUrl: 'https://example.com/article', + title: 'Hello', + extractedText: 'hi', + outboundLinks: collect(), + wordCount: 1, + errorMessage: null, + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $fresh = $page->fresh(); + $this->assertSame(PageStatusEnum::Fetched, $fresh->status); + $this->assertNotNull($fresh->fetched_at); + $this->assertInstanceOf(Carbon::class, $fresh->fetched_at); + $this->assertSame('Hello', $fresh->title); + } + + public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Rejected, + statusCode: 200, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'Unsupported Content-Type: application/pdf', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 
'https://example.com/brochure.pdf']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $fresh = $page->fresh(); + $this->assertSame(PageStatusEnum::Rejected, $fresh->status); + $this->assertNull($fresh->fetched_at); + } + + public function test_handle_updates_page_to_failed_on_blocked_4xx(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Blocked4xx, + statusCode: 404, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'HTTP 404', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/gone']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $fresh = $page->fresh(); + $this->assertSame(PageStatusEnum::Failed, $fresh->status); + $this->assertNotNull($fresh->failed_at); + $this->assertInstanceOf(Carbon::class, $fresh->failed_at); + } + + public function test_handle_updates_page_to_failed_on_timeout(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Timeout, + statusCode: null, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'Connection timed out after 10 seconds', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/slow']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + 
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $fresh = $page->fresh(); + $this->assertSame(PageStatusEnum::Failed, $fresh->status); + $this->assertNotNull($fresh->failed_at); + $this->assertInstanceOf(Carbon::class, $fresh->failed_at); + } + + public function test_handle_schedules_retry_on_transient_failure(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Failed, + statusCode: null, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'Connection refused', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + // A second PageCrawl row (the retry) must have been inserted for the same page + $this->assertSame(2, PageCrawl::where('page_id', $page->id)->count()); + + // The new row is pending — outcome IS NULL + $retryRow = PageCrawl::where('page_id', $page->id) + ->whereNull('outcome') + ->first(); + $this->assertNotNull($retryRow); + + // A delayed ProcessCrawlJob must have been pushed for the retry row + Queue::assertPushed( + ProcessCrawlJob::class, + fn (ProcessCrawlJob $job) => $job->pageCrawl->page_id === $page->id + && $job->pageCrawl->id === $retryRow->id, + ); + } + + public function test_handle_does_not_retry_after_three_attempts(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Failed, + statusCode: null, + finalUrl: null, + title: null, + 
extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'Connection refused', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']); + + // 3 prior attempts already exist — this is the cap + PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly(); + PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly(); + $thirdCrawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + // No 4th row must appear — retry cap reached + $this->assertSame(3, PageCrawl::where('page_id', $page->id)->count()); + + // No retry job dispatched + Queue::assertNotPushed(ProcessCrawlJob::class); + } + + public function test_handle_registers_outbound_links_on_success(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Success, + statusCode: 200, + finalUrl: 'https://source.com/article', + title: 'Source Article', + extractedText: 'some text', + outboundLinks: collect([ + 'https://other.com/article-1', + 'https://another.com/post-2', + ]), + wordCount: 2, + errorMessage: null, + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://source.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']); + $this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']); + $this->assertSame(3, Page::count()); + } } diff --git 
a/tests/Feature/PageQueuePopulationTest.php b/tests/Feature/PageQueuePopulationTest.php index 6addcd0..943d79c 100644 --- a/tests/Feature/PageQueuePopulationTest.php +++ b/tests/Feature/PageQueuePopulationTest.php @@ -32,16 +32,6 @@ public function test_creating_a_page_inserts_a_page_crawl_row(): void $this->assertNotNull($crawl); } - public function test_created_page_crawl_has_null_outcome(): void - { - $page = Page::factory()->create(['url' => 'https://example-blog.com/article']); - - $crawl = PageCrawl::where('page_id', $page->id)->first(); - - $this->assertNotNull($crawl); - $this->assertNull($crawl->outcome); - } - public function test_first_or_create_with_existing_url_does_not_insert_duplicate_crawl(): void { $url = 'https://example-blog.com/article'; diff --git a/tests/Unit/Models/PageCrawlFactoryTest.php b/tests/Unit/Models/PageCrawlFactoryTest.php index 21990fa..df9c02f 100644 --- a/tests/Unit/Models/PageCrawlFactoryTest.php +++ b/tests/Unit/Models/PageCrawlFactoryTest.php @@ -9,6 +9,7 @@ use App\Models\PageCrawl; use Carbon\Carbon; use Illuminate\Foundation\Testing\RefreshDatabase; +use Illuminate\Support\Facades\Queue; use Tests\TestCase; class PageCrawlFactoryTest extends TestCase @@ -17,6 +18,8 @@ class PageCrawlFactoryTest extends TestCase public function test_factory_successful_state_produces_success_outcome(): void { + Queue::fake(); + $page = Page::factory()->create(); $crawl = PageCrawl::factory()->page($page)->successful()->create(); @@ -27,6 +30,8 @@ public function test_factory_successful_state_produces_success_outcome(): void public function test_factory_failed_state_produces_failed_outcome_with_message(): void { + Queue::fake(); + $page = Page::factory()->create(); $crawl = PageCrawl::factory()->page($page)->failed('Connection timed out')->create(); diff --git a/tests/Unit/Models/PageCrawlTest.php b/tests/Unit/Models/PageCrawlTest.php index 73fdad0..08f8a59 100644 --- a/tests/Unit/Models/PageCrawlTest.php +++ 
b/tests/Unit/Models/PageCrawlTest.php @@ -9,6 +9,7 @@ use App\Models\PageCrawl; use Carbon\Carbon; use Illuminate\Foundation\Testing\RefreshDatabase; +use Illuminate\Support\Facades\Queue; use Tests\TestCase; class PageCrawlTest extends TestCase @@ -17,6 +18,8 @@ class PageCrawlTest extends TestCase public function test_page_crawl_fillable_fields_persist_and_casts_are_applied(): void { + Queue::fake(); + $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-1']); $completedAt = Carbon::parse('2026-05-01 10:01:05'); @@ -90,6 +93,8 @@ public function test_deleting_a_page_cascades_to_its_page_crawls(): void public function test_pending_crawls_are_filtered_by_null_outcome(): void { + Queue::fake(); + // createQuietly() skips the PageObserver; this test counts rows with null/non-null // outcome — the auto-inserted observer crawl (outcome=null) would corrupt both counts. $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-pending']); diff --git a/tests/Unit/Models/PageTest.php b/tests/Unit/Models/PageTest.php index 27e9740..3e08b56 100644 --- a/tests/Unit/Models/PageTest.php +++ b/tests/Unit/Models/PageTest.php @@ -10,6 +10,7 @@ use App\Models\PageLink; use Carbon\Carbon; use Illuminate\Foundation\Testing\RefreshDatabase; +use Illuminate\Support\Facades\Queue; use Lvl0\FediDiscover\Config\InstanceType; use Lvl0\FediDiscover\Models\Instance; use Tests\TestCase; @@ -18,6 +19,12 @@ class PageTest extends TestCase { use RefreshDatabase; + protected function setUp(): void + { + parent::setUp(); + Queue::fake(); + } + public function test_page_model_fillable_fields_can_be_mass_assigned(): void { $page = Page::create([ From e8a935ea3152b4c99bb61db7b3fa6ac1c81fe7c3 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 23:54:40 +0200 Subject: [PATCH 42/65] 14 - Document queue worker container in deployment README --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index 
2d5a5d1..4fbbe84 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ ### Required environment ### Services you need to provide - **App**: pull `forge.lvl0.xyz/lvl0/trove:latest` (or a pinned `v*` tag). Exposes port `8000` inside the container. The image runs migrations and warms caches on boot. +- **Worker**: same image as `app`, with `command: php artisan queue:work --tries=3 --max-time=3600`. Processes the crawler queue (URL fetching, content extraction, retries). Crawls won't actually run without this — `app` only enqueues work. **Required for the crawler to function.** - **PostgreSQL 17**. Hostname must be reachable as `db` (default) or set `DB_HOST`. Persist `/var/lib/postgresql/data`. - **Redis 7** with `--appendonly yes` (queue jobs persist across restarts). Hostname `redis` or set `REDIS_HOST`. @@ -71,6 +72,22 @@ ### Example compose stack db: { condition: service_healthy } redis: { condition: service_healthy } + worker: + image: forge.lvl0.xyz/lvl0/trove:latest + restart: always + command: php artisan queue:work --tries=3 --max-time=3600 + environment: + APP_KEY: "${APP_KEY}" + APP_URL: "${APP_URL}" + DB_DATABASE: "${DB_DATABASE}" + DB_USERNAME: "${DB_USERNAME}" + DB_PASSWORD: "${DB_PASSWORD}" + volumes: + - app_storage:/app/storage + depends_on: + db: { condition: service_healthy } + redis: { condition: service_healthy } + db: image: postgres:17-alpine restart: always From 3297c4bb3bfecdbd5499dbe98f6a89adab75cc10 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Mon, 27 Apr 2026 00:18:34 +0200 Subject: [PATCH 43/65] 14 - Fix ProcessCrawlJob outcome write and status mapping bugs --- app/Jobs/ProcessCrawlJob.php | 41 ++++-- tests/Feature/Jobs/ProcessCrawlJobTest.php | 138 +++++++++++++++++++++ 2 files changed, 172 insertions(+), 7 deletions(-) diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php index a8b4513..11a2993 100644 --- a/app/Jobs/ProcessCrawlJob.php +++ b/app/Jobs/ProcessCrawlJob.php @@ -26,11 +26,7 @@ public function handle( 
/** @var FetchResult $result */ $result = $fetcher($this->pageCrawl->page->url); - $this->pageCrawl->update([ - 'outcome' => CrawlOutcomeEnum::Success, - 'completed_at' => now(), - 'status_code' => 200, - ]); + $this->updatePageCrawl($result); $update = match ($result->outcome) { CrawlOutcomeEnum::Rejected => [ @@ -41,10 +37,19 @@ public function handle( 'status' => PageStatusEnum::Failed, 'failed_at' => now(), ], + CrawlOutcomeEnum::Failed => [ + 'status' => PageStatusEnum::Failed, + ], CrawlOutcomeEnum::Blocked4xx => [ 'status' => PageStatusEnum::Failed, 'failed_at' => now(), ], + CrawlOutcomeEnum::Blocked5xx => [ + 'status' => PageStatusEnum::Failed, + ], + CrawlOutcomeEnum::BlockedRobots => [ + 'status' => PageStatusEnum::Failed, + ], default => [ 'status' => PageStatusEnum::Fetched, 'fetched_at' => now(), @@ -52,10 +57,12 @@ public function handle( ], }; - $result->outboundLinks->each(fn (string $url) => $register($url)); - $this->pageCrawl->page->update($update); + if ($result->outcome !== CrawlOutcomeEnum::Failed) { + $result->outboundLinks->each(fn (string $url) => $register($url)); + } + if (in_array($result->outcome, [ CrawlOutcomeEnum::Failed, CrawlOutcomeEnum::Timeout, @@ -81,4 +88,24 @@ private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): v ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour()); } + + public function updatePageCrawl(FetchResult $result): void + { + $outcome = CrawlOutcomeEnum::Success; + $errorMessage = null; + $statusCode = 200; + + if ($result->outcome === CrawlOutcomeEnum::Failed) { + $outcome = CrawlOutcomeEnum::Failed; + $errorMessage = $result->errorMessage; + $statusCode = null; + } + + $this->pageCrawl->update([ + 'outcome' => $outcome, + 'completed_at' => now(), + 'status_code' => $statusCode, + 'error_message' => $errorMessage, + ]); + } } diff --git a/tests/Feature/Jobs/ProcessCrawlJobTest.php b/tests/Feature/Jobs/ProcessCrawlJobTest.php index 8503089..dab484b 100644 --- 
a/tests/Feature/Jobs/ProcessCrawlJobTest.php +++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php @@ -266,6 +266,144 @@ public function test_handle_does_not_retry_after_three_attempts(): void Queue::assertNotPushed(ProcessCrawlJob::class); } + public function test_handle_writes_failed_outcome_to_page_crawl(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Failed, + statusCode: null, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'boom', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $this->assertDatabaseHas('page_crawls', [ + 'id' => $crawl->id, + 'outcome' => CrawlOutcomeEnum::Failed->value, + 'status_code' => null, + 'error_message' => 'boom', + ]); + } + + public function test_handle_updates_page_to_failed_on_failed_outcome(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Failed, + statusCode: null, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'Connection refused', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $this->assertSame(PageStatusEnum::Failed, 
$page->fresh()->status); + } + + public function test_handle_updates_page_to_failed_on_blocked_5xx(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Blocked5xx, + statusCode: 503, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'HTTP 503', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); + } + + public function test_handle_updates_page_to_failed_on_blocked_robots(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::BlockedRobots, + statusCode: null, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect(), + wordCount: null, + errorMessage: 'Disallowed by robots.txt', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); + } + + public function test_handle_does_not_register_outbound_links_on_failure(): void + { + Queue::fake(); + + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: CrawlOutcomeEnum::Failed, + 
statusCode: null, + finalUrl: null, + title: null, + extractedText: null, + outboundLinks: collect([ + 'https://should-not-be-registered.com/page', + ]), + wordCount: null, + errorMessage: 'Connection refused', + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/broken']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + + $this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']); + $this->assertSame(1, Page::count()); + } + public function test_handle_registers_outbound_links_on_success(): void { Queue::fake(); From c80be24e6ecfdd78c2d44b7fe7c638d61ebbbc0a Mon Sep 17 00:00:00 2001 From: myrmidex Date: Mon, 27 Apr 2026 00:24:38 +0200 Subject: [PATCH 44/65] chore - Extract mockFetchPageAction helper in ProcessCrawlJobTest --- tests/Feature/Jobs/ProcessCrawlJobTest.php | 199 +++++---------------- 1 file changed, 44 insertions(+), 155 deletions(-) diff --git a/tests/Feature/Jobs/ProcessCrawlJobTest.php b/tests/Feature/Jobs/ProcessCrawlJobTest.php index dab484b..487f6d7 100644 --- a/tests/Feature/Jobs/ProcessCrawlJobTest.php +++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php @@ -14,6 +14,7 @@ use App\ValueObjects\FetchResult; use Carbon\Carbon; use Illuminate\Foundation\Testing\RefreshDatabase; +use Illuminate\Support\Collection; use Illuminate\Support\Facades\Queue; use Mockery; use Tests\TestCase; @@ -49,18 +50,7 @@ public function test_handle_writes_outcome_to_page_crawl_on_success(): void { Queue::fake(); - $fetcher = Mockery::mock(FetchPageAction::class); - $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( - outcome: CrawlOutcomeEnum::Success, - statusCode: 200, - finalUrl: 'https://example.com/article', - title: 'Hello', - extractedText: 'hi', - outboundLinks: collect(), - 
wordCount: 1, - errorMessage: null, - )); - $this->app->instance(FetchPageAction::class, $fetcher); + $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200, title: 'Hello', extractedText: 'hi', wordCount: 1); $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); $crawl = PageCrawl::factory()->page($page)->createQuietly(); @@ -80,18 +70,7 @@ public function test_handle_updates_page_to_fetched_on_success(): void { Queue::fake(); - $fetcher = Mockery::mock(FetchPageAction::class); - $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( - outcome: CrawlOutcomeEnum::Success, - statusCode: 200, - finalUrl: 'https://example.com/article', - title: 'Hello', - extractedText: 'hi', - outboundLinks: collect(), - wordCount: 1, - errorMessage: null, - )); - $this->app->instance(FetchPageAction::class, $fetcher); + $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200, title: 'Hello', extractedText: 'hi', wordCount: 1); $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); $crawl = PageCrawl::factory()->page($page)->createQuietly(); @@ -110,18 +89,7 @@ public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void { Queue::fake(); - $fetcher = Mockery::mock(FetchPageAction::class); - $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( - outcome: CrawlOutcomeEnum::Rejected, - statusCode: 200, - finalUrl: null, - title: null, - extractedText: null, - outboundLinks: collect(), - wordCount: null, - errorMessage: 'Unsupported Content-Type: application/pdf', - )); - $this->app->instance(FetchPageAction::class, $fetcher); + $this->mockFetchPageAction(CrawlOutcomeEnum::Rejected, statusCode: 200, errorMessage: 'Unsupported Content-Type: application/pdf'); $page = Page::factory()->createQuietly(['url' => 'https://example.com/brochure.pdf']); $crawl = PageCrawl::factory()->page($page)->createQuietly(); @@ -138,18 +106,7 @@ public function 
test_handle_updates_page_to_failed_on_blocked_4xx(): void { Queue::fake(); - $fetcher = Mockery::mock(FetchPageAction::class); - $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( - outcome: CrawlOutcomeEnum::Blocked4xx, - statusCode: 404, - finalUrl: null, - title: null, - extractedText: null, - outboundLinks: collect(), - wordCount: null, - errorMessage: 'HTTP 404', - )); - $this->app->instance(FetchPageAction::class, $fetcher); + $this->mockFetchPageAction(CrawlOutcomeEnum::Blocked4xx, statusCode: 404, errorMessage: 'HTTP 404'); $page = Page::factory()->createQuietly(['url' => 'https://example.com/gone']); $crawl = PageCrawl::factory()->page($page)->createQuietly(); @@ -167,18 +124,7 @@ public function test_handle_updates_page_to_failed_on_timeout(): void { Queue::fake(); - $fetcher = Mockery::mock(FetchPageAction::class); - $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( - outcome: CrawlOutcomeEnum::Timeout, - statusCode: null, - finalUrl: null, - title: null, - extractedText: null, - outboundLinks: collect(), - wordCount: null, - errorMessage: 'Connection timed out after 10 seconds', - )); - $this->app->instance(FetchPageAction::class, $fetcher); + $this->mockFetchPageAction(CrawlOutcomeEnum::Timeout, errorMessage: 'Connection timed out after 10 seconds'); $page = Page::factory()->createQuietly(['url' => 'https://example.com/slow']); $crawl = PageCrawl::factory()->page($page)->createQuietly(); @@ -196,18 +142,7 @@ public function test_handle_schedules_retry_on_transient_failure(): void { Queue::fake(); - $fetcher = Mockery::mock(FetchPageAction::class); - $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( - outcome: CrawlOutcomeEnum::Failed, - statusCode: null, - finalUrl: null, - title: null, - extractedText: null, - outboundLinks: collect(), - wordCount: null, - errorMessage: 'Connection refused', - )); - $this->app->instance(FetchPageAction::class, $fetcher); + $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, 
errorMessage: 'Connection refused'); $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']); $crawl = PageCrawl::factory()->page($page)->createQuietly(); @@ -236,18 +171,7 @@ public function test_handle_does_not_retry_after_three_attempts(): void { Queue::fake(); - $fetcher = Mockery::mock(FetchPageAction::class); - $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( - outcome: CrawlOutcomeEnum::Failed, - statusCode: null, - finalUrl: null, - title: null, - extractedText: null, - outboundLinks: collect(), - wordCount: null, - errorMessage: 'Connection refused', - )); - $this->app->instance(FetchPageAction::class, $fetcher); + $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused'); $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']); @@ -270,18 +194,7 @@ public function test_handle_writes_failed_outcome_to_page_crawl(): void { Queue::fake(); - $fetcher = Mockery::mock(FetchPageAction::class); - $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( - outcome: CrawlOutcomeEnum::Failed, - statusCode: null, - finalUrl: null, - title: null, - extractedText: null, - outboundLinks: collect(), - wordCount: null, - errorMessage: 'boom', - )); - $this->app->instance(FetchPageAction::class, $fetcher); + $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'boom'); $page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']); $crawl = PageCrawl::factory()->page($page)->createQuietly(); @@ -301,18 +214,7 @@ public function test_handle_updates_page_to_failed_on_failed_outcome(): void { Queue::fake(); - $fetcher = Mockery::mock(FetchPageAction::class); - $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( - outcome: CrawlOutcomeEnum::Failed, - statusCode: null, - finalUrl: null, - title: null, - extractedText: null, - outboundLinks: collect(), - wordCount: null, - errorMessage: 'Connection refused', - )); - 
$this->app->instance(FetchPageAction::class, $fetcher); + $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused'); $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']); $crawl = PageCrawl::factory()->page($page)->createQuietly(); @@ -327,18 +229,7 @@ public function test_handle_updates_page_to_failed_on_blocked_5xx(): void { Queue::fake(); - $fetcher = Mockery::mock(FetchPageAction::class); - $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( - outcome: CrawlOutcomeEnum::Blocked5xx, - statusCode: 503, - finalUrl: null, - title: null, - extractedText: null, - outboundLinks: collect(), - wordCount: null, - errorMessage: 'HTTP 503', - )); - $this->app->instance(FetchPageAction::class, $fetcher); + $this->mockFetchPageAction(CrawlOutcomeEnum::Blocked5xx, statusCode: 503, errorMessage: 'HTTP 503'); $page = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']); $crawl = PageCrawl::factory()->page($page)->createQuietly(); @@ -353,18 +244,7 @@ public function test_handle_updates_page_to_failed_on_blocked_robots(): void { Queue::fake(); - $fetcher = Mockery::mock(FetchPageAction::class); - $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( - outcome: CrawlOutcomeEnum::BlockedRobots, - statusCode: null, - finalUrl: null, - title: null, - extractedText: null, - outboundLinks: collect(), - wordCount: null, - errorMessage: 'Disallowed by robots.txt', - )); - $this->app->instance(FetchPageAction::class, $fetcher); + $this->mockFetchPageAction(CrawlOutcomeEnum::BlockedRobots, errorMessage: 'Disallowed by robots.txt'); $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']); $crawl = PageCrawl::factory()->page($page)->createQuietly(); @@ -379,20 +259,11 @@ public function test_handle_does_not_register_outbound_links_on_failure(): void { Queue::fake(); - $fetcher = Mockery::mock(FetchPageAction::class); - 
$fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( - outcome: CrawlOutcomeEnum::Failed, - statusCode: null, - finalUrl: null, - title: null, - extractedText: null, - outboundLinks: collect([ - 'https://should-not-be-registered.com/page', - ]), - wordCount: null, + $this->mockFetchPageAction( + CrawlOutcomeEnum::Failed, + outboundLinks: collect(['https://should-not-be-registered.com/page']), errorMessage: 'Connection refused', - )); - $this->app->instance(FetchPageAction::class, $fetcher); + ); $page = Page::factory()->createQuietly(['url' => 'https://example.com/broken']); $crawl = PageCrawl::factory()->page($page)->createQuietly(); @@ -408,21 +279,15 @@ public function test_handle_registers_outbound_links_on_success(): void { Queue::fake(); - $fetcher = Mockery::mock(FetchPageAction::class); - $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( - outcome: CrawlOutcomeEnum::Success, + $this->mockFetchPageAction( + CrawlOutcomeEnum::Success, statusCode: 200, finalUrl: 'https://source.com/article', title: 'Source Article', extractedText: 'some text', - outboundLinks: collect([ - 'https://other.com/article-1', - 'https://another.com/post-2', - ]), + outboundLinks: collect(['https://other.com/article-1', 'https://another.com/post-2']), wordCount: 2, - errorMessage: null, - )); - $this->app->instance(FetchPageAction::class, $fetcher); + ); $page = Page::factory()->createQuietly(['url' => 'https://source.com/article']); $crawl = PageCrawl::factory()->page($page)->createQuietly(); @@ -434,4 +299,28 @@ public function test_handle_registers_outbound_links_on_success(): void $this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']); $this->assertSame(3, Page::count()); } + + private function mockFetchPageAction( + CrawlOutcomeEnum $outcome, + ?int $statusCode = null, + ?string $finalUrl = 'https://example.com/article', + ?string $title = null, + ?string $extractedText = null, + ?Collection $outboundLinks = null, + ?int $wordCount = 
null, + ?string $errorMessage = null, + ): void { + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( + outcome: $outcome, + statusCode: $statusCode, + finalUrl: $finalUrl, + title: $title, + extractedText: $extractedText, + outboundLinks: $outboundLinks ?? collect(), + wordCount: $wordCount, + errorMessage: $errorMessage, + )); + $this->app->instance(FetchPageAction::class, $fetcher); + } } From 69aa5d9d3e87677653f451208339179b96da0b6b Mon Sep 17 00:00:00 2001 From: myrmidex Date: Mon, 27 Apr 2026 00:41:12 +0200 Subject: [PATCH 45/65] 10 - Add /bot page with crawler identity and opt-out instructions --- resources/views/bot.blade.php | 60 +++++++++++++++++++++++++++++++++++ routes/web.php | 2 ++ tests/Feature/BotPageTest.php | 39 +++++++++++++++++++++++ 3 files changed, 101 insertions(+) create mode 100644 resources/views/bot.blade.php create mode 100644 tests/Feature/BotPageTest.php diff --git a/resources/views/bot.blade.php b/resources/views/bot.blade.php new file mode 100644 index 0000000..4859039 --- /dev/null +++ b/resources/views/bot.blade.php @@ -0,0 +1,60 @@ + +
+

About TroveBot

+ +

+ Trove is a federated search engine for the small web, + seeded by fediverse attention and ranked by domain coherence rather than + commercial authority. TroveBot is its crawler — it + discovers and indexes URLs shared by people on the fediverse, then + follows the citations they make to find more of the small web. +

+ +

Identity

+ +

TroveBot identifies itself with the following User-Agent string:

+ +
TroveBot/0.1 (+https://trove.lvl0.xyz/bot)
+ +

Crawling behavior

+ +
    +
  • Respects robots.txt rules under User-agent: TroveBot (and the wildcard User-agent: * as a fallback).
  • +
  • Polite per-domain rate limit — at most a few requests per minute per host.
  • +
  • Follows up to 5 redirects per URL.
  • +
  • Fetches HTML only. PDFs, images, and other binary content are recorded as discovered but never re-fetched.
  • +
  • Does not execute JavaScript, does not crawl behind authentication, does not crawl URLs containing user credentials.
  • +
+ +

Opt out

+ +

+ Block TroveBot entirely by adding the following to your site's + robots.txt: +

+ +
User-agent: TroveBot
+Disallow: /
+ +

+ Or block specific paths: +

+ +
User-agent: TroveBot
+Disallow: /private/
+Disallow: /admin/
+ +

Contact & source

+ + +
+
diff --git a/routes/web.php b/routes/web.php index 5f96afc..6199875 100644 --- a/routes/web.php +++ b/routes/web.php @@ -9,3 +9,5 @@ }); Route::view('/submit', 'urls.submit'); + +Route::view('/bot', 'bot'); diff --git a/tests/Feature/BotPageTest.php b/tests/Feature/BotPageTest.php new file mode 100644 index 0000000..e544b00 --- /dev/null +++ b/tests/Feature/BotPageTest.php @@ -0,0 +1,39 @@ +get('/bot'); + + $response->assertStatus(200); + } + + public function test_bot_page_contains_user_agent_string(): void + { + $response = $this->get('/bot'); + + $response->assertSee('TroveBot/0.1 (+https://trove.lvl0.xyz/bot)', escape: false); + } + + public function test_bot_page_contains_robots_txt_opt_out_example(): void + { + $response = $this->get('/bot'); + + $response->assertSee('User-agent: TroveBot', escape: false); + $response->assertSee('Disallow: /', escape: false); + } + + public function test_bot_page_links_to_forge_repository(): void + { + $response = $this->get('/bot'); + + $response->assertSee('https://forge.lvl0.xyz/lvl0/trove', escape: false); + } +} From 71713483707912b60d13643b273b000b36fc5ad2 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Mon, 27 Apr 2026 00:59:45 +0200 Subject: [PATCH 46/65] 11 - Add PolitenessService and crawler delay config --- .env.example | 2 ++ app/Services/PolitenessService.php | 19 +++++++++++++++ config/crawler.php | 2 ++ tests/Unit/Services/PolitenessServiceTest.php | 23 +++++++++++++++++++ 4 files changed, 46 insertions(+) create mode 100644 app/Services/PolitenessService.php create mode 100644 tests/Unit/Services/PolitenessServiceTest.php diff --git a/.env.example b/.env.example index 79935dc..f83cd0b 100644 --- a/.env.example +++ b/.env.example @@ -61,3 +61,5 @@ AWS_BUCKET= AWS_USE_PATH_STYLE_ENDPOINT=false VITE_APP_NAME="${APP_NAME}" + +CRAWLER_MIN_DOMAIN_DELAY_SECONDS=10 diff --git a/app/Services/PolitenessService.php b/app/Services/PolitenessService.php new file mode 100644 index 0000000..5114458 --- /dev/null +++ 
b/app/Services/PolitenessService.php @@ -0,0 +1,19 @@ + env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'), + + 'min_domain_delay_seconds' => env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10), ]; diff --git a/tests/Unit/Services/PolitenessServiceTest.php b/tests/Unit/Services/PolitenessServiceTest.php new file mode 100644 index 0000000..f9a2c6b --- /dev/null +++ b/tests/Unit/Services/PolitenessServiceTest.php @@ -0,0 +1,23 @@ +assertSame(10, (new PolitenessService)->minDelayFor('example.com')); + } + + public function test_min_delay_for_respects_config_override(): void + { + config()->set('crawler.min_domain_delay_seconds', 30); + + $this->assertSame(30, (new PolitenessService)->minDelayFor('example.com')); + } +} From 1538ceeb6e8cfaf7d4e918ae2c139e95d2c3fbaa Mon Sep 17 00:00:00 2001 From: myrmidex Date: Mon, 27 Apr 2026 01:25:46 +0200 Subject: [PATCH 47/65] 11 - Gate ProcessCrawlJob with per-domain politeness lock --- app/Jobs/ProcessCrawlJob.php | 21 ++++- tests/Feature/Jobs/ProcessCrawlJobTest.php | 95 ++++++++++++++++++---- 2 files changed, 98 insertions(+), 18 deletions(-) diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php index 11a2993..d2928d0 100644 --- a/app/Jobs/ProcessCrawlJob.php +++ b/app/Jobs/ProcessCrawlJob.php @@ -7,9 +7,11 @@ use App\Enums\CrawlOutcomeEnum; use App\Enums\PageStatusEnum; use App\Models\PageCrawl; +use App\Services\PolitenessService; use App\ValueObjects\FetchResult; use Illuminate\Contracts\Queue\ShouldQueue; use Illuminate\Foundation\Queue\Queueable; +use Illuminate\Support\Facades\Cache; class ProcessCrawlJob implements ShouldQueue { @@ -19,10 +21,21 @@ public function __construct( public PageCrawl $pageCrawl, ) {} - public function handle( - FetchPageAction $fetcher, - RegisterDiscoveredPageAction $register, - ): void { + public function handle(): void + { + $fetcher = resolve(FetchPageAction::class); + $register = resolve(RegisterDiscoveredPageAction::class); + $politenessService = 
resolve(PolitenessService::class); + + $delay = $politenessService->minDelayFor($this->pageCrawl->domain); + $lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay); + + if (! $lock->get()) { + $this->release($delay); + + return; + } + /** @var FetchResult $result */ $result = $fetcher($this->pageCrawl->page->url); diff --git a/tests/Feature/Jobs/ProcessCrawlJobTest.php b/tests/Feature/Jobs/ProcessCrawlJobTest.php index 487f6d7..f504cb6 100644 --- a/tests/Feature/Jobs/ProcessCrawlJobTest.php +++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php @@ -5,7 +5,6 @@ namespace Tests\Feature\Jobs; use App\Actions\FetchPageAction; -use App\Actions\RegisterDiscoveredPageAction; use App\Enums\CrawlOutcomeEnum; use App\Enums\PageStatusEnum; use App\Jobs\ProcessCrawlJob; @@ -15,6 +14,7 @@ use Carbon\Carbon; use Illuminate\Foundation\Testing\RefreshDatabase; use Illuminate\Support\Collection; +use Illuminate\Support\Facades\Cache; use Illuminate\Support\Facades\Queue; use Mockery; use Tests\TestCase; @@ -56,7 +56,7 @@ public function test_handle_writes_outcome_to_page_crawl_on_success(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $fresh = $crawl->fresh(); $this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome); @@ -76,7 +76,7 @@ public function test_handle_updates_page_to_fetched_on_success(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $fresh = $page->fresh(); $this->assertSame(PageStatusEnum::Fetched, $fresh->status); @@ -95,7 +95,7 @@ public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => 
$crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $fresh = $page->fresh(); $this->assertSame(PageStatusEnum::Rejected, $fresh->status); @@ -112,7 +112,7 @@ public function test_handle_updates_page_to_failed_on_blocked_4xx(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $fresh = $page->fresh(); $this->assertSame(PageStatusEnum::Failed, $fresh->status); @@ -130,7 +130,7 @@ public function test_handle_updates_page_to_failed_on_timeout(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $fresh = $page->fresh(); $this->assertSame(PageStatusEnum::Failed, $fresh->status); @@ -148,7 +148,7 @@ public function test_handle_schedules_retry_on_transient_failure(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); // A second PageCrawl row (the retry) must have been inserted for the same page $this->assertSame(2, PageCrawl::where('page_id', $page->id)->count()); @@ -181,7 +181,7 @@ public function test_handle_does_not_retry_after_three_attempts(): void $thirdCrawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); // No 4th row must appear — retry cap reached $this->assertSame(3, PageCrawl::where('page_id', $page->id)->count()); @@ -200,7 +200,7 @@ public function test_handle_writes_failed_outcome_to_page_crawl(): void $crawl = 
PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $this->assertDatabaseHas('page_crawls', [ 'id' => $crawl->id, @@ -220,7 +220,7 @@ public function test_handle_updates_page_to_failed_on_failed_outcome(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); } @@ -235,7 +235,7 @@ public function test_handle_updates_page_to_failed_on_blocked_5xx(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); } @@ -250,7 +250,7 @@ public function test_handle_updates_page_to_failed_on_blocked_robots(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); } @@ -269,7 +269,7 @@ public function test_handle_does_not_register_outbound_links_on_failure(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']); $this->assertSame(1, Page::count()); @@ -293,13 +293,80 @@ public function test_handle_registers_outbound_links_on_success(): void $crawl = PageCrawl::factory()->page($page)->createQuietly(); 
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) - ->handle(app(FetchPageAction::class), app(RegisterDiscoveredPageAction::class)); + ->handle(); $this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']); $this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']); $this->assertSame(3, Page::count()); } + public function test_handle_releases_job_when_domain_is_locked(): void + { + Queue::fake(); + + // Pre-acquire the lock so the job sees it as already held + Cache::lock('crawler:domain:example.com', 10)->get(); + + // The fetcher must NOT be called — the job should bail before reaching it + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldNotReceive('__invoke'); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + $job = new ProcessCrawlJob($crawl); + $job->handle(); + + // No outcome written — handle() returned early + $this->assertNull($crawl->fresh()->outcome); + + // Page status unchanged from its factory default (Discovered) + $this->assertSame(PageStatusEnum::Discovered, $page->fresh()->status); + } + + public function test_handle_does_not_release_lock_after_completion(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + $job = new ProcessCrawlJob($crawl); + $job->handle(); + + // If handle() called $lock->release(), this second get() would succeed (true). + // It must fail (false) — the lock acquired inside handle() must still be held. 
+ $result = Cache::lock('crawler:domain:example.com', 10)->get(); + $this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.'); + } + + public function test_handle_acquires_domain_lock_before_fetching(): void + { + Queue::fake(); + + $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200); + + $page = Page::factory()->createQuietly(['url' => 'https://lock-test.example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + $domain = $crawl->domain; + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + // The lock must still be held after handle() completes — a second attempt to acquire it fails + $this->assertFalse( + Cache::lock("crawler:domain:{$domain}", 10)->get(), + 'Expected the domain lock to still be held after handle() ran, but it was free.', + ); + + // The fetch ran — outcome was written (proves the lock did not block execution) + $this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome); + } + private function mockFetchPageAction( CrawlOutcomeEnum $outcome, ?int $statusCode = null, From 264180cd369d12a56792168266de81787d224f92 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Mon, 27 Apr 2026 01:36:37 +0200 Subject: [PATCH 48/65] =?UTF-8?q?chore=20-=20Move=20outcome=20=E2=86=92=20?= =?UTF-8?q?status=20mapping=20into=20CrawlOutcomeEnum=20methods?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/Enums/CrawlOutcomeEnum.php | 37 ++++++++ app/Jobs/ProcessCrawlJob.php | 105 +++++++++------------- tests/Unit/Enums/CrawlOutcomeEnumTest.php | 39 ++++++++ 3 files changed, 119 insertions(+), 62 deletions(-) diff --git a/app/Enums/CrawlOutcomeEnum.php b/app/Enums/CrawlOutcomeEnum.php index 949cf69..582fdc9 100644 --- a/app/Enums/CrawlOutcomeEnum.php +++ b/app/Enums/CrawlOutcomeEnum.php @@ -20,4 +20,41 @@ enum CrawlOutcomeEnum: string * prevent 
re-discovery loops as fediverse re-shares the URL. */ case Rejected = 'rejected'; + + /** + * The PageStatusEnum value the parent `pages` row should land on for this outcome. + */ + public function toPageStatus(): PageStatusEnum + { + return match ($this) { + self::Success => PageStatusEnum::Fetched, + self::Rejected => PageStatusEnum::Rejected, + self::Failed, + self::Timeout, + self::BlockedRobots, + self::Blocked4xx, + self::Blocked5xx => PageStatusEnum::Failed, + }; + } + + /** + * True if the worker should retry this outcome (transient failures only). + * Permanent failures (4xx, robots block, rejected content type) and successes do not retry. + */ + public function isRetryable(): bool + { + return match ($this) { + self::Failed, self::Timeout, self::Blocked5xx => true, + self::Success, self::Rejected, self::BlockedRobots, self::Blocked4xx => false, + }; + } + + /** + * True if the worker should register the outbound links discovered during the fetch. + * Only Success outcomes have meaningful links; everything else either failed or returned no usable HTML. 
+ */ + public function shouldRegisterOutboundLinks(): bool + { + return $this === self::Success; + } } diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php index d2928d0..7b30a3f 100644 --- a/app/Jobs/ProcessCrawlJob.php +++ b/app/Jobs/ProcessCrawlJob.php @@ -1,10 +1,11 @@ pageCrawl->page->url); - $this->updatePageCrawl($result); + $this->writeOutcome($result); + $this->updatePageStatus($result); - $update = match ($result->outcome) { - CrawlOutcomeEnum::Rejected => [ - 'status' => PageStatusEnum::Rejected, - 'fetched_at' => null, - ], - CrawlOutcomeEnum::Timeout => [ - 'status' => PageStatusEnum::Failed, - 'failed_at' => now(), - ], - CrawlOutcomeEnum::Failed => [ - 'status' => PageStatusEnum::Failed, - ], - CrawlOutcomeEnum::Blocked4xx => [ - 'status' => PageStatusEnum::Failed, - 'failed_at' => now(), - ], - CrawlOutcomeEnum::Blocked5xx => [ - 'status' => PageStatusEnum::Failed, - ], - CrawlOutcomeEnum::BlockedRobots => [ - 'status' => PageStatusEnum::Failed, - ], - default => [ - 'status' => PageStatusEnum::Fetched, + if ($result->outcome->shouldRegisterOutboundLinks()) { + $result->outboundLinks->each(fn (string $url) => $register($url)); + } + + if ($result->outcome->isRetryable()) { + $this->scheduleRetryIfNeeded(); + } + } + + private function writeOutcome(FetchResult $result): void + { + $this->pageCrawl->update([ + 'outcome' => $result->outcome, + 'completed_at' => now(), + 'status_code' => $result->statusCode, + 'error_message' => $result->errorMessage, + ]); + } + + private function updatePageStatus(FetchResult $result): void + { + $status = $result->outcome->toPageStatus(); + + $update = match ($status) { + PageStatusEnum::Fetched => [ + 'status' => $status, 'fetched_at' => now(), 'title' => $result->title, ], + PageStatusEnum::Failed => [ + 'status' => $status, + 'failed_at' => now(), + ], + PageStatusEnum::Rejected => [ + 'status' => $status, + ], + PageStatusEnum::Discovered => [ + 'status' => $status, + ], }; 
$this->pageCrawl->page->update($update); - - if ($result->outcome !== CrawlOutcomeEnum::Failed) { - $result->outboundLinks->each(fn (string $url) => $register($url)); - } - - if (in_array($result->outcome, [ - CrawlOutcomeEnum::Failed, - CrawlOutcomeEnum::Timeout, - CrawlOutcomeEnum::Blocked5xx, - ])) { - $this->scheduleRetryIfNeeded($result, $this->pageCrawl); - } } - private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): void + private function scheduleRetryIfNeeded(): void { - if (PageCrawl::where('page_id', $crawl->page_id)->count() >= 3) { + if (PageCrawl::where('page_id', $this->pageCrawl->page_id)->count() >= 3) { return; } $newRow = PageCrawl::withoutEvents( fn () => PageCrawl::create( - array_merge($crawl->toArray(), [ + array_merge($this->pageCrawl->toArray(), [ 'outcome' => null, ]) ) @@ -101,24 +102,4 @@ private function scheduleRetryIfNeeded(FetchResult $result, PageCrawl $crawl): v ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour()); } - - public function updatePageCrawl(FetchResult $result): void - { - $outcome = CrawlOutcomeEnum::Success; - $errorMessage = null; - $statusCode = 200; - - if ($result->outcome === CrawlOutcomeEnum::Failed) { - $outcome = CrawlOutcomeEnum::Failed; - $errorMessage = $result->errorMessage; - $statusCode = null; - } - - $this->pageCrawl->update([ - 'outcome' => $outcome, - 'completed_at' => now(), - 'status_code' => $statusCode, - 'error_message' => $errorMessage, - ]); - } } diff --git a/tests/Unit/Enums/CrawlOutcomeEnumTest.php b/tests/Unit/Enums/CrawlOutcomeEnumTest.php index 56261cb..17b214d 100644 --- a/tests/Unit/Enums/CrawlOutcomeEnumTest.php +++ b/tests/Unit/Enums/CrawlOutcomeEnumTest.php @@ -5,6 +5,7 @@ namespace Tests\Unit\Enums; use App\Enums\CrawlOutcomeEnum; +use App\Enums\PageStatusEnum; use Tests\TestCase; class CrawlOutcomeEnumTest extends TestCase @@ -33,4 +34,42 @@ public function test_enum_has_exactly_seven_cases(): void { $this->assertCount(7, CrawlOutcomeEnum::cases()); 
} + + public function test_to_page_status_maps_each_outcome_correctly(): void + { + $this->assertSame(PageStatusEnum::Fetched, CrawlOutcomeEnum::Success->toPageStatus()); + $this->assertSame(PageStatusEnum::Rejected, CrawlOutcomeEnum::Rejected->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Failed->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Timeout->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Blocked4xx->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Blocked5xx->toPageStatus()); + $this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::BlockedRobots->toPageStatus()); + } + + public function test_is_retryable_returns_true_only_for_transient_failures(): void + { + // Retryable: transient network/server problems that may resolve later + $this->assertTrue(CrawlOutcomeEnum::Failed->isRetryable()); + $this->assertTrue(CrawlOutcomeEnum::Timeout->isRetryable()); + $this->assertTrue(CrawlOutcomeEnum::Blocked5xx->isRetryable()); + + // Not retryable: success (done), permanent failures, or policy decisions + $this->assertFalse(CrawlOutcomeEnum::Success->isRetryable()); + $this->assertFalse(CrawlOutcomeEnum::Rejected->isRetryable()); + $this->assertFalse(CrawlOutcomeEnum::BlockedRobots->isRetryable()); + $this->assertFalse(CrawlOutcomeEnum::Blocked4xx->isRetryable()); + } + + public function test_should_register_outbound_links_returns_true_only_for_success(): void + { + $this->assertTrue(CrawlOutcomeEnum::Success->shouldRegisterOutboundLinks()); + + // No links to register on any non-Success outcome + $this->assertFalse(CrawlOutcomeEnum::Failed->shouldRegisterOutboundLinks()); + $this->assertFalse(CrawlOutcomeEnum::Timeout->shouldRegisterOutboundLinks()); + $this->assertFalse(CrawlOutcomeEnum::Rejected->shouldRegisterOutboundLinks()); + $this->assertFalse(CrawlOutcomeEnum::BlockedRobots->shouldRegisterOutboundLinks()); + 
$this->assertFalse(CrawlOutcomeEnum::Blocked4xx->shouldRegisterOutboundLinks()); + $this->assertFalse(CrawlOutcomeEnum::Blocked5xx->shouldRegisterOutboundLinks()); + } } From cda1414cd872d19a1d19f5d79b41d595ab76dbd4 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Mon, 27 Apr 2026 23:53:52 +0200 Subject: [PATCH 49/65] 9 - Add robots.txt handling with cache and politeness integration --- app/Jobs/ProcessCrawlJob.php | 14 +++ app/Services/PolitenessService.php | 10 +- app/Services/RobotsService.php | 60 ++++++++++++ composer.json | 1 + composer.lock | 62 +++++++++++- config/crawler.php | 1 + tests/Feature/Jobs/ProcessCrawlJobTest.php | 90 +++++++++++++++++ tests/Unit/Services/PolitenessServiceTest.php | 33 +++++++ tests/Unit/Services/RobotsServiceTest.php | 96 +++++++++++++++++++ 9 files changed, 361 insertions(+), 6 deletions(-) create mode 100644 app/Services/RobotsService.php create mode 100644 tests/Unit/Services/RobotsServiceTest.php diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php index 7b30a3f..ff700e2 100644 --- a/app/Jobs/ProcessCrawlJob.php +++ b/app/Jobs/ProcessCrawlJob.php @@ -6,9 +6,11 @@ use App\Actions\FetchPageAction; use App\Actions\RegisterDiscoveredPageAction; +use App\Enums\CrawlOutcomeEnum; use App\Enums\PageStatusEnum; use App\Models\PageCrawl; use App\Services\PolitenessService; +use App\Services\RobotsService; use App\ValueObjects\FetchResult; use Illuminate\Contracts\Queue\ShouldQueue; use Illuminate\Foundation\Queue\Queueable; @@ -24,6 +26,18 @@ public function __construct( public function handle(): void { + $robotsService = resolve(RobotsService::class); + + if (! 
$robotsService->isAllowed($this->pageCrawl->page->url)) { + $this->pageCrawl->update([ + 'outcome' => CrawlOutcomeEnum::BlockedRobots, + 'completed_at' => now(), + ]); + $this->pageCrawl->page->update(['status' => PageStatusEnum::Failed]); + + return; + } + $fetcher = resolve(FetchPageAction::class); $register = resolve(RegisterDiscoveredPageAction::class); $politenessService = resolve(PolitenessService::class); diff --git a/app/Services/PolitenessService.php b/app/Services/PolitenessService.php index 5114458..4d2b12b 100644 --- a/app/Services/PolitenessService.php +++ b/app/Services/PolitenessService.php @@ -8,12 +8,12 @@ class PolitenessService { public function minDelayFor(string $domain): int { - $configValue = config('crawler.min_domain_delay_seconds'); + /** @var RobotsService $robotsService */ + $robotsService = resolve(RobotsService::class); + $crawlDelay = $robotsService->crawlDelayFor($domain, config('crawler.user_agent')); - if ($configValue !== null) { - return $configValue; - } + $configValue = config('crawler.min_domain_delay_seconds', 10); - return 10; + return max($crawlDelay ?? 0, $configValue); } } diff --git a/app/Services/RobotsService.php b/app/Services/RobotsService.php new file mode 100644 index 0000000..f8b7f65 --- /dev/null +++ b/app/Services/RobotsService.php @@ -0,0 +1,60 @@ +urlService->host($url); + $path = parse_url($url, PHP_URL_PATH) ?? '/'; + + $body = Cache::remember( + "crawler:robots:{$host}", + config('crawler.robots_cache_ttl_seconds'), + function () use ($host) { + try { + $response = Http::get("https://{$host}/robots.txt"); + + return $response->successful() ? 
$response->body() : ''; + } catch (ConnectionException) { + return ''; + } + } + ); + + return (new RobotsTxt($body))->allows($path, $userAgent); + } + + public function crawlDelayFor(string $host, string $userAgent): ?int + { + $body = Cache::remember( + "crawler:robots:{$host}", + config('crawler.robots_cache_ttl_seconds'), + function () use ($host) { + try { + $response = Http::get("https://{$host}/robots.txt"); + + return $response->successful() ? $response->body() : ''; + } catch (ConnectionException) { + return ''; + } + } + ); + + $delay = (new RobotsTxt($body))->crawlDelay($userAgent); + + return $delay !== null ? (int) $delay : null; + } +} diff --git a/composer.json b/composer.json index de1ad17..6ba251e 100644 --- a/composer.json +++ b/composer.json @@ -21,6 +21,7 @@ "laravel/tinker": "^3.0", "livewire/livewire": "^4.2", "lvl0/fedi-discover": "@dev", + "spatie/robots-txt": "^2.5", "symfony/dom-crawler": "^7.4" }, "require-dev": { diff --git a/composer.lock b/composer.lock index e1fe116..d2b609b 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "2c63ed546b17b144997244f805e8a94a", + "content-hash": "707278fe3558199c1d07f11dba1d20ec", "packages": [ { "name": "brick/math", @@ -3549,6 +3549,66 @@ }, "time": "2025-12-14T04:43:48+00:00" }, + { + "name": "spatie/robots-txt", + "version": "2.5.4", + "source": { + "type": "git", + "url": "https://github.com/spatie/robots-txt.git", + "reference": "a8dd35d0a94e863f52509a366a634978e9c1db03" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/spatie/robots-txt/zipball/a8dd35d0a94e863f52509a366a634978e9c1db03", + "reference": "a8dd35d0a94e863f52509a366a634978e9c1db03", + "shasum": "" + }, + "require": { + "php": "^8.1" + }, + "require-dev": { + "phpunit/phpunit": "^11.5.2" + }, + "type": "library", + "autoload": { + "psr-4": { + 
"Spatie\\Robots\\": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Brent Roose", + "email": "brent@spatie.be", + "homepage": "https://spatie.be", + "role": "Developer" + } + ], + "description": "Determine if a page may be crawled from robots.txt and robots meta tags", + "homepage": "https://github.com/spatie/robots-txt", + "keywords": [ + "robots-txt", + "spatie" + ], + "support": { + "issues": "https://github.com/spatie/robots-txt/issues", + "source": "https://github.com/spatie/robots-txt/tree/2.5.4" + }, + "funding": [ + { + "url": "https://spatie.be/open-source/support-us", + "type": "custom" + }, + { + "url": "https://github.com/spatie", + "type": "github" + } + ], + "time": "2026-02-25T07:59:20+00:00" + }, { "name": "symfony/clock", "version": "v7.4.8", diff --git a/config/crawler.php b/config/crawler.php index 108176f..f633ce5 100644 --- a/config/crawler.php +++ b/config/crawler.php @@ -43,4 +43,5 @@ 'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'), 'min_domain_delay_seconds' => env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10), + 'robots_cache_ttl_seconds' => env('CRAWLER_ROBOTS_CACHE_TTL_SECONDS', 60 * 60 * 24), ]; diff --git a/tests/Feature/Jobs/ProcessCrawlJobTest.php b/tests/Feature/Jobs/ProcessCrawlJobTest.php index f504cb6..722f29f 100644 --- a/tests/Feature/Jobs/ProcessCrawlJobTest.php +++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php @@ -15,6 +15,7 @@ use Illuminate\Foundation\Testing\RefreshDatabase; use Illuminate\Support\Collection; use Illuminate\Support\Facades\Cache; +use Illuminate\Support\Facades\Http; use Illuminate\Support\Facades\Queue; use Mockery; use Tests\TestCase; @@ -343,6 +344,46 @@ public function test_handle_does_not_release_lock_after_completion(): void $this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.'); } + public 
function test_handle_writes_blocked_robots_when_disallowed(): void + { + Queue::fake(); + + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: *\nDisallow: /", + 200, + ), + ]); + + // FetchPageAction must never be called — the robots gate returns before the lock + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldNotReceive('__invoke'); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + $domain = $crawl->domain; + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + // Outcome row must record BlockedRobots + $this->assertDatabaseHas('page_crawls', [ + 'id' => $crawl->id, + 'outcome' => CrawlOutcomeEnum::BlockedRobots->value, + ]); + + // Page status must be Failed (BlockedRobots::toPageStatus() === Failed) + $this->assertSame(PageStatusEnum::Failed, $page->fresh()->status); + + // The politeness lock must still be acquirable — the gate returned before ever claiming it + $this->assertTrue( + Cache::lock("crawler:domain:{$domain}", 10)->get(), + 'Expected the politeness lock to be free (gate returned before acquiring it), but it was already held.', + ); + } + public function test_handle_acquires_domain_lock_before_fetching(): void { Queue::fake(); @@ -367,6 +408,55 @@ public function test_handle_acquires_domain_lock_before_fetching(): void $this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome); } + public function test_handle_proceeds_through_politeness_lock_when_robots_allow(): void + { + Queue::fake(); + + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: *\nAllow: /", + 200, + ), + ]); + + // FetchPageAction must be called exactly once — robots gate passed, fetch proceeds + $fetcher = Mockery::mock(FetchPageAction::class); + $fetcher->shouldReceive('__invoke')->once()->andReturn(new 
FetchResult( + outcome: CrawlOutcomeEnum::Success, + statusCode: 200, + finalUrl: 'https://example.com/article', + title: 'Hello', + extractedText: 'hi', + outboundLinks: collect(), + wordCount: 1, + errorMessage: null, + )); + $this->app->instance(FetchPageAction::class, $fetcher); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + $domain = $crawl->domain; + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + // Outcome must be Success — not BlockedRobots + $this->assertDatabaseHas('page_crawls', [ + 'id' => $crawl->id, + 'outcome' => CrawlOutcomeEnum::Success->value, + ]); + + // Page status must have advanced to Fetched + $this->assertSame(PageStatusEnum::Fetched, $page->fresh()->status); + + // Politeness lock must still be held (claimed during the fetch, never released) + $this->assertFalse( + Cache::lock("crawler:domain:{$domain}", 10)->get(), + 'Expected the politeness lock to be held after a successful fetch, but it was free.', + ); + } + private function mockFetchPageAction( CrawlOutcomeEnum $outcome, ?int $statusCode = null, diff --git a/tests/Unit/Services/PolitenessServiceTest.php b/tests/Unit/Services/PolitenessServiceTest.php index f9a2c6b..ce93fee 100644 --- a/tests/Unit/Services/PolitenessServiceTest.php +++ b/tests/Unit/Services/PolitenessServiceTest.php @@ -5,6 +5,7 @@ namespace Tests\Unit\Services; use App\Services\PolitenessService; +use Illuminate\Support\Facades\Http; use Tests\TestCase; class PolitenessServiceTest extends TestCase @@ -20,4 +21,36 @@ public function test_min_delay_for_respects_config_override(): void $this->assertSame(30, (new PolitenessService)->minDelayFor('example.com')); } + + public function test_min_delay_for_uses_robots_crawl_delay_when_higher(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + // Spatie does exact-token matching (lowercased), so the fixture UA + // 
must match the full string the service passes to crawlDelayFor(). + "User-agent: TroveBot/0.1 (+https://trove.lvl0.xyz/bot)\nCrawl-delay: 30", + 200, + ), + ]); + + config()->set('crawler.min_domain_delay_seconds', 10); + config()->set('crawler.user_agent', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'); + + $this->assertSame(30, app(PolitenessService::class)->minDelayFor('example.com')); + } + + public function test_min_delay_for_uses_config_when_higher_than_robots(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: TroveBot/0.1 (+https://trove.lvl0.xyz/bot)\nCrawl-delay: 10", + 200, + ), + ]); + + config()->set('crawler.min_domain_delay_seconds', 60); + config()->set('crawler.user_agent', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'); + + $this->assertSame(60, app(PolitenessService::class)->minDelayFor('example.com')); + } } diff --git a/tests/Unit/Services/RobotsServiceTest.php b/tests/Unit/Services/RobotsServiceTest.php new file mode 100644 index 0000000..746c173 --- /dev/null +++ b/tests/Unit/Services/RobotsServiceTest.php @@ -0,0 +1,96 @@ + Http::response( + "User-agent: *\nAllow: /", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $this->assertTrue($service->isAllowed('https://example.com/article', 'TroveBot/0.1')); + } + + public function test_is_allowed_returns_false_when_robots_txt_disallows_path(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: *\nDisallow: /", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $this->assertFalse($service->isAllowed('https://example.com/article', 'TroveBot/0.1')); + } + + public function test_is_allowed_returns_true_when_robots_txt_fetch_fails(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response('', 500), + ]); + + $service = app(RobotsService::class); + + $this->assertTrue($service->isAllowed('https://example.com/article', 'TroveBot/0.1')); + } + + public function 
test_is_allowed_caches_robots_txt_body_per_host(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: *\nAllow: /", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $service->isAllowed('https://example.com/article', 'TroveBot/0.1'); + $service->isAllowed('https://example.com/another-article', 'TroveBot/0.1'); + + Http::assertSentCount(1); + } + + public function test_crawl_delay_for_returns_parsed_value(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: TroveBot/0.1\nCrawl-delay: 30", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $this->assertSame(30, $service->crawlDelayFor('example.com', 'TroveBot/0.1')); + } + + public function test_crawl_delay_for_returns_null_when_absent(): void + { + Http::fake([ + 'https://example.com/robots.txt' => Http::response( + "User-agent: *\nDisallow: /private", + 200, + ), + ]); + + $service = app(RobotsService::class); + + $this->assertNull($service->crawlDelayFor('example.com', 'TroveBot/0.1')); + } +} From a37b1da1457995e01dc4865a177e16942b8e3338 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Tue, 28 Apr 2026 00:13:14 +0200 Subject: [PATCH 50/65] 13 - Add language_confidence column and Page model cast --- app/Models/Page.php | 2 ++ .../2026_04_25_234157_create_pages_table.php | 1 + tests/Unit/Models/PageTest.php | 20 +++++++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/app/Models/Page.php b/app/Models/Page.php index 02a0a8e..52a131c 100644 --- a/app/Models/Page.php +++ b/app/Models/Page.php @@ -25,6 +25,7 @@ class Page extends Model 'url', 'status', 'language', + 'language_confidence', 'title', 'instance_id', 'posted_at', @@ -34,6 +35,7 @@ class Page extends Model protected $casts = [ 'status' => PageStatusEnum::class, + 'language_confidence' => 'float', 'posted_at' => 'datetime', 'fetched_at' => 'datetime', 'failed_at' => 'datetime', diff --git 
a/database/migrations/2026_04_25_234157_create_pages_table.php b/database/migrations/2026_04_25_234157_create_pages_table.php index e1df51f..2379f87 100644 --- a/database/migrations/2026_04_25_234157_create_pages_table.php +++ b/database/migrations/2026_04_25_234157_create_pages_table.php @@ -16,6 +16,7 @@ public function up(): void $table->text('url')->unique(); $table->string('status')->default(PageStatusEnum::Discovered->value)->index(); $table->string('language', 35)->nullable()->index(); + $table->decimal('language_confidence', 4, 3)->nullable(); $table->string('title')->nullable(); $table->foreignId('instance_id') ->nullable() diff --git a/tests/Unit/Models/PageTest.php b/tests/Unit/Models/PageTest.php index 3e08b56..95645ad 100644 --- a/tests/Unit/Models/PageTest.php +++ b/tests/Unit/Models/PageTest.php @@ -152,6 +152,26 @@ public function test_page_latest_crawl_returns_row_with_latest_created_at(): voi $this->assertSame('sentinel-latest', $latest->error_message); } + public function test_language_confidence_is_fillable_nullable_and_cast_to_float(): void + { + // Column must exist, be nullable (null round-trips cleanly), be mass-assignable, + // and the 'float' cast must be applied so we get a PHP float back, not a string. 
+ $withConfidence = Page::factory()->createQuietly([ + 'language' => 'en', + 'language_confidence' => 0.857, + ]); + + $fresh = $withConfidence->fresh(); + + $this->assertNotNull($fresh); + $this->assertIsFloat($fresh->language_confidence); + $this->assertEqualsWithDelta(0.857, $fresh->language_confidence, 0.001); + + $withoutConfidence = Page::factory()->createQuietly(); + + $this->assertNull($withoutConfidence->fresh()->language_confidence); + } + public function test_page_status_is_cast_to_enum(): void { $cases = [ From 829ce04282b42308ff78073420f6ebf1c8018d97 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Tue, 28 Apr 2026 00:32:54 +0200 Subject: [PATCH 51/65] 13 - Add LanguageDetectionService wrapping patrickschur/language-detection --- app/Providers/AppServiceProvider.php | 9 +-- app/Services/LanguageDetectionService.php | 39 ++++++++++ composer.json | 1 + composer.lock | 53 ++++++++++++- .../Services/LanguageDetectionServiceTest.php | 74 +++++++++++++++++++ 5 files changed, 168 insertions(+), 8 deletions(-) create mode 100644 app/Services/LanguageDetectionService.php create mode 100644 tests/Unit/Services/LanguageDetectionServiceTest.php diff --git a/app/Providers/AppServiceProvider.php b/app/Providers/AppServiceProvider.php index 5cafe3e..30eaf8a 100644 --- a/app/Providers/AppServiceProvider.php +++ b/app/Providers/AppServiceProvider.php @@ -3,23 +3,18 @@ namespace App\Providers; use App\Listeners\UrlDiscoveredListener; +use App\Services\LanguageDetectionService; use Illuminate\Support\Facades\Event; use Illuminate\Support\ServiceProvider; use Lvl0\FediDiscover\Events\UrlDiscovered; class AppServiceProvider extends ServiceProvider { - /** - * Register any application services. - */ public function register(): void { - // + $this->app->singleton(LanguageDetectionService::class); } - /** - * Bootstrap any application services. 
- */ public function boot(): void { Event::listen(UrlDiscovered::class, UrlDiscoveredListener::class); diff --git a/app/Services/LanguageDetectionService.php b/app/Services/LanguageDetectionService.php new file mode 100644 index 0000000..2724ea0 --- /dev/null +++ b/app/Services/LanguageDetectionService.php @@ -0,0 +1,39 @@ +language = new Language; + } + + /** + * @return array{0: string, 1: float}|null + */ + public function detect(string $text): ?array + { + if (trim($text) === '') { + return null; + } + + $languages = $this->language->detect($text)->bestResults()->close(); + + if ($languages === []) { + return null; + } + + // bestResults() keeps every candidate within 0.025 of the top score. + // array_key_first picks the highest-ranked one (arsort'd by the library). + $code = array_key_first($languages); + + return [$code, $languages[$code]]; + } +} diff --git a/composer.json b/composer.json index 6ba251e..9af1143 100644 --- a/composer.json +++ b/composer.json @@ -21,6 +21,7 @@ "laravel/tinker": "^3.0", "livewire/livewire": "^4.2", "lvl0/fedi-discover": "@dev", + "patrickschur/language-detection": "^5.3", "spatie/robots-txt": "^2.5", "symfony/dom-crawler": "^7.4" }, diff --git a/composer.lock b/composer.lock index d2b609b..51ecdd2 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "707278fe3558199c1d07f11dba1d20ec", + "content-hash": "4d6e239c94fea8e9511f1e73f05db1df", "packages": [ { "name": "brick/math", @@ -2785,6 +2785,57 @@ ], "time": "2026-02-16T23:10:27+00:00" }, + { + "name": "patrickschur/language-detection", + "version": "v5.3.1", + "source": { + "type": "git", + "url": "https://github.com/patrickschur/language-detection.git", + "reference": "df8d32021b2ef9fde52e6fcccb83e3806822c9c6" + }, + "dist": { + "type": "zip", + "url": 
"https://api.github.com/repos/patrickschur/language-detection/zipball/df8d32021b2ef9fde52e6fcccb83e3806822c9c6", + "reference": "df8d32021b2ef9fde52e6fcccb83e3806822c9c6", + "shasum": "" + }, + "require": { + "ext-json": "*", + "ext-mbstring": "*", + "php": "^7.4 || ^8.0" + }, + "require-dev": { + "phpunit/phpunit": "^9.5.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "LanguageDetection\\": "src/LanguageDetection" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Patrick Schur", + "email": "patrick_schur@outlook.de" + } + ], + "description": "A language detection library for PHP. Detects the language from a given text string.", + "homepage": "https://github.com/patrickschur/language-detection", + "keywords": [ + "detect", + "detection", + "language" + ], + "support": { + "issues": "https://github.com/patrickschur/language-detection/issues", + "source": "https://github.com/patrickschur/language-detection/tree/v5.3.1" + }, + "time": "2025-03-25T22:47:08+00:00" + }, { "name": "phpoption/phpoption", "version": "1.9.5", diff --git a/tests/Unit/Services/LanguageDetectionServiceTest.php b/tests/Unit/Services/LanguageDetectionServiceTest.php new file mode 100644 index 0000000..62d4453 --- /dev/null +++ b/tests/Unit/Services/LanguageDetectionServiceTest.php @@ -0,0 +1,74 @@ +service = new LanguageDetectionService; + } + + public function test_detects_english_from_english_paragraph(): void + { + $text = 'The solar system is the gravitationally bound system of the Sun and the + objects that orbit it. Of the bodies that orbit the Sun directly, the largest + are the eight planets, with the remainder being smaller objects, the dwarf + planets and small solar system bodies. 
Planets and most other large bodies + in the solar system orbit the Sun in the same direction, counterclockwise + when viewed from above the Sun\'s north pole.'; + + $result = $this->service->detect($text); + + $this->assertIsArray($result); + $this->assertCount(2, $result); + $this->assertTrue( + str_starts_with($result[0], 'en'), + "Expected an English-family tag, got '{$result[0]}'.", + ); + $this->assertIsFloat($result[1]); + $this->assertGreaterThan(0.0, $result[1]); + $this->assertLessThanOrEqual(1.0, $result[1]); + } + + public function test_detects_portuguese_from_portuguese_paragraph(): void + { + $text = 'O sistema solar é o sistema gravitacionalmente ligado composto pelo Sol e + pelos objetos que orbitam ao seu redor. Dos corpos que orbitam o Sol + diretamente, os maiores são os oito planetas, sendo o restante composto por + objetos menores, como planetas anões e corpos menores do sistema solar. + A Terra é o único planeta conhecido a abrigar vida, possuindo uma atmosfera + rica em nitrogênio e oxigênio que sustenta os seres vivos.'; + + $result = $this->service->detect($text); + + $this->assertIsArray($result); + $this->assertCount(2, $result); + $this->assertTrue( + str_starts_with($result[0], 'pt'), + "Expected a Portuguese-family tag, got '{$result[0]}'.", + ); + $this->assertIsFloat($result[1]); + $this->assertGreaterThan(0.0, $result[1]); + $this->assertLessThanOrEqual(1.0, $result[1]); + } + + public function test_returns_null_for_empty_string(): void + { + $this->assertNull($this->service->detect('')); + } + + public function test_returns_null_for_whitespace_only_string(): void + { + $this->assertNull($this->service->detect(' ')); + } +} From cb83b0df9057bf8072af09a0a8e115e955b3641c Mon Sep 17 00:00:00 2001 From: myrmidex Date: Tue, 28 Apr 2026 00:38:46 +0200 Subject: [PATCH 52/65] 13 - Add language and languageConfidence fields to FetchResult --- app/ValueObjects/FetchResult.php | 2 ++ tests/Unit/ValueObjects/FetchResultTest.php | 8 ++++++++ 2 
files changed, 10 insertions(+) diff --git a/app/ValueObjects/FetchResult.php b/app/ValueObjects/FetchResult.php index d79cdae..3514b37 100644 --- a/app/ValueObjects/FetchResult.php +++ b/app/ValueObjects/FetchResult.php @@ -22,5 +22,7 @@ public function __construct( public Collection $outboundLinks, public ?int $wordCount, public ?string $errorMessage, + public ?string $language = null, + public ?float $languageConfidence = null, ) {} } diff --git a/tests/Unit/ValueObjects/FetchResultTest.php b/tests/Unit/ValueObjects/FetchResultTest.php index c3185f8..463dcb7 100644 --- a/tests/Unit/ValueObjects/FetchResultTest.php +++ b/tests/Unit/ValueObjects/FetchResultTest.php @@ -22,6 +22,8 @@ public function test_it_exposes_all_fields(): void outboundLinks: collect(['https://other.com', 'https://another.com']), wordCount: 5, errorMessage: null, + language: 'en', + languageConfidence: 0.95, ); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); @@ -33,6 +35,8 @@ public function test_it_exposes_all_fields(): void $this->assertSame(['https://other.com', 'https://another.com'], $result->outboundLinks->all()); $this->assertSame(5, $result->wordCount); $this->assertNull($result->errorMessage); + $this->assertSame('en', $result->language); + $this->assertSame(0.95, $result->languageConfidence); } public function test_it_accepts_null_for_failure_outcome_fields(): void @@ -46,6 +50,8 @@ public function test_it_accepts_null_for_failure_outcome_fields(): void outboundLinks: collect(), wordCount: null, errorMessage: 'Could not connect', + language: null, + languageConfidence: null, ); $this->assertSame(CrawlOutcomeEnum::Failed, $result->outcome); @@ -56,5 +62,7 @@ public function test_it_accepts_null_for_failure_outcome_fields(): void $this->assertSame([], $result->outboundLinks->all()); $this->assertNull($result->wordCount); $this->assertSame('Could not connect', $result->errorMessage); + $this->assertNull($result->language); + 
$this->assertNull($result->languageConfidence); } } From 81b3c7f70bdc770e231e07ed7ef345ad98711187 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Tue, 28 Apr 2026 01:09:18 +0200 Subject: [PATCH 53/65] 13 - Wire LanguageDetectionService into FetchPageAction with lang attr fallback --- app/Actions/FetchPageAction.php | 36 +++- tests/Feature/Actions/FetchPageActionTest.php | 181 ++++++++++++++++++ 2 files changed, 215 insertions(+), 2 deletions(-) diff --git a/app/Actions/FetchPageAction.php b/app/Actions/FetchPageAction.php index ec92a8d..e906b15 100644 --- a/app/Actions/FetchPageAction.php +++ b/app/Actions/FetchPageAction.php @@ -5,6 +5,7 @@ namespace App\Actions; use App\Enums\CrawlOutcomeEnum; +use App\Services\LanguageDetectionService; use App\Services\UrlService; use App\ValueObjects\FetchResult; use fivefilters\Readability\Configuration; @@ -20,9 +21,14 @@ class FetchPageAction { + private const MIN_WORDS_FOR_TEXT_DETECTION = 20; + + private const MIN_TEXT_DETECTION_CONFIDENCE = 0.30; + public function __construct( private Factory $http, private UrlService $urlService, + private LanguageDetectionService $languageDetection, ) {} public function __invoke(string $url): FetchResult @@ -46,8 +52,9 @@ public function __invoke(string $url): FetchResult [$outcome, $error] = $this->validateResponse($response); if ($outcome === CrawlOutcomeEnum::Success) { - [$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url); + [$title, $extractedText, $links, $crawler] = $this->extractTitleTextAndLinks($response->body(), $url); $wordCount = $extractedText !== '' ? count(preg_split('/\s+/u', trim($extractedText))) : 0; + [$language, $languageConfidence] = $this->detectLanguage($crawler, $extractedText, $wordCount); } return new FetchResult( @@ -59,6 +66,8 @@ public function __invoke(string $url): FetchResult outboundLinks: $links ?? collect(), wordCount: $wordCount ?? null, errorMessage: $error ?? null, + language: $language ?? 
null, + languageConfidence: $languageConfidence ?? null, ); } @@ -135,7 +144,7 @@ private function extractTitleTextAndLinks(string $body, string $url): array ->unique() ->values(); - return [$title, $extractedText, $linksResolved]; + return [$title, $extractedText, $linksResolved, $crawler]; } private function resolveAndValidateLink(string $href, string $finalUrl): ?string @@ -159,4 +168,27 @@ private function resolveAndValidateLink(string $href, string $finalUrl): ?string return $resolved; } + + /** + * @return array{0: ?string, 1: ?float} + */ + private function detectLanguage(Crawler $crawler, string $extractedText, ?int $wordCount = null): array + { + if ($wordCount >= self::MIN_WORDS_FOR_TEXT_DETECTION) { + $result = $this->languageDetection->detect($extractedText); + if ($result !== null && $result[1] >= self::MIN_TEXT_DETECTION_CONFIDENCE) { + return [$result[0], $result[1]]; + } + } + + $lang = $crawler->filter('html')->count() > 0 + ? trim($crawler->filter('html')->attr('lang') ?? '') + : ''; + + if ($lang !== '' && strlen($lang) <= 35) { + return [$lang, 1.0]; + } + + return [null, null]; + } } diff --git a/tests/Feature/Actions/FetchPageActionTest.php b/tests/Feature/Actions/FetchPageActionTest.php index b5f415a..826c755 100644 --- a/tests/Feature/Actions/FetchPageActionTest.php +++ b/tests/Feature/Actions/FetchPageActionTest.php @@ -6,6 +6,7 @@ use App\Actions\FetchPageAction; use App\Enums\CrawlOutcomeEnum; +use App\Services\LanguageDetectionService; use App\ValueObjects\FetchResult; use GuzzleHttp\Exception\ConnectException; use GuzzleHttp\Psr7\Request; @@ -323,6 +324,186 @@ public function test_fragment_only_href_is_filtered_from_outbound_links(): void $this->assertSame(0, $result->outboundLinks->count()); } + public function test_sufficient_text_triggers_language_detection_and_result_propagates(): void + { + // 24 words — above the detection threshold + $body = <<<'HTML' + + + Language Detection Test + +
+

The quick brown fox jumps over the lazy dog and then runs away into the forest + where many other animals live and play together every single day.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($body, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect') + ->once() + ->andReturn(['en', 0.95]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame('en', $result->language); + $this->assertSame(0.95, $result->languageConfidence); + } + + public function test_short_body_with_html_lang_attr_skips_service_and_uses_lang_attr(): void + { + // 7 words — below the detection threshold + $html = <<<'HTML' + + + Short Page + +
+

Too short to detect language automatically.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect')->never(); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame('pt-BR', $result->language); + $this->assertSame(1.0, $result->languageConfidence); + } + + public function test_short_body_with_no_lang_attr_returns_null_language(): void + { + // 7 words — below the detection threshold + $html = <<<'HTML' + + + Short Page + +
+

Too short to detect language automatically.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect')->never(); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertNull($result->language); + $this->assertNull($result->languageConfidence); + } + + public function test_whitespace_only_lang_attr_is_treated_as_absent(): void + { + // 7 words — below the detection threshold; lang attr is blank/whitespace-only + $html = <<<'HTML' + + + Short Page + +
+

Too short to detect language automatically.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect')->never(); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertNull($result->language); + $this->assertNull($result->languageConfidence); + } + + public function test_lang_attr_longer_than_35_chars_is_rejected(): void + { + // 7 words — below the detection threshold; lang attr exceeds BCP-47 column width (string(35)) + $html = <<<'HTML' + + + Short Page + +
+

Too short to detect language automatically.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect')->never(); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertNull($result->language); + $this->assertNull($result->languageConfidence); + } + + public function test_low_confidence_detection_falls_through_to_lang_attr(): void + { + // 24 words — above the detection threshold; service returns low-confidence result + $html = <<<'HTML' + + + Confidence Floor Test + +
+

The quick brown fox jumps over the lazy dog and then runs away into the forest + where many other animals live and play together every single day.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $detection = $this->mock(LanguageDetectionService::class); + $detection->shouldReceive('detect') + ->once() + ->andReturn(['xx', 0.15]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame('en-US', $result->language); + $this->assertSame(1.0, $result->languageConfidence); + } + private function makeAction(): FetchPageAction { return app(FetchPageAction::class); From 1cba8f3fc9da29a3838f5e7fd7f70dd03033cdd7 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Tue, 28 Apr 2026 01:20:13 +0200 Subject: [PATCH 54/65] 13 - Persist detected language and confidence on Page after successful fetch --- app/Jobs/ProcessCrawlJob.php | 2 + tests/Feature/Jobs/ProcessCrawlJobTest.php | 54 ++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php index ff700e2..071bd49 100644 --- a/app/Jobs/ProcessCrawlJob.php +++ b/app/Jobs/ProcessCrawlJob.php @@ -84,6 +84,8 @@ private function updatePageStatus(FetchResult $result): void 'status' => $status, 'fetched_at' => now(), 'title' => $result->title, + 'language' => $result->language, + 'language_confidence' => $result->languageConfidence, ], PageStatusEnum::Failed => [ 'status' => $status, diff --git a/tests/Feature/Jobs/ProcessCrawlJobTest.php b/tests/Feature/Jobs/ProcessCrawlJobTest.php index 722f29f..bf353e6 100644 --- a/tests/Feature/Jobs/ProcessCrawlJobTest.php +++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php @@ -457,6 +457,56 @@ public function test_handle_proceeds_through_politeness_lock_when_robots_allow() ); } + public function test_handle_persists_language_on_success(): void + { + Queue::fake(); + + $this->mockFetchPageAction( + CrawlOutcomeEnum::Success, + statusCode: 200, + title: 'Hello', + extractedText: 'hi', + wordCount: 1, + language: 'en', 
+ languageConfidence: 0.95, + ); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $fresh = $page->fresh(); + $this->assertSame('en', $fresh->language); + $this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001); + } + + public function test_handle_persists_null_language_on_success(): void + { + Queue::fake(); + + $this->mockFetchPageAction( + CrawlOutcomeEnum::Success, + statusCode: 200, + title: 'Hello', + extractedText: 'hi', + wordCount: 1, + language: null, + languageConfidence: null, + ); + + $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $fresh = $page->fresh(); + $this->assertNull($fresh->language); + $this->assertNull($fresh->language_confidence); + } + private function mockFetchPageAction( CrawlOutcomeEnum $outcome, ?int $statusCode = null, @@ -466,6 +516,8 @@ private function mockFetchPageAction( ?Collection $outboundLinks = null, ?int $wordCount = null, ?string $errorMessage = null, + ?string $language = null, + ?float $languageConfidence = null, ): void { $fetcher = Mockery::mock(FetchPageAction::class); $fetcher->shouldReceive('__invoke')->andReturn(new FetchResult( @@ -477,6 +529,8 @@ private function mockFetchPageAction( outboundLinks: $outboundLinks ?? 
collect(), wordCount: $wordCount, errorMessage: $errorMessage, + language: $language, + languageConfidence: $languageConfidence, )); $this->app->instance(FetchPageAction::class, $fetcher); } From a59c086da2b2920e73db0a3b31c8f1885dcdef4e Mon Sep 17 00:00:00 2001 From: myrmidex Date: Tue, 28 Apr 2026 01:32:38 +0200 Subject: [PATCH 55/65] 13 - Make page language sticky across re-crawls when new fetch returns null --- app/Jobs/ProcessCrawlJob.php | 10 ++++-- tests/Feature/Jobs/ProcessCrawlJobTest.php | 38 +++++++++++++++++++++- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/app/Jobs/ProcessCrawlJob.php b/app/Jobs/ProcessCrawlJob.php index 071bd49..2c15b0c 100644 --- a/app/Jobs/ProcessCrawlJob.php +++ b/app/Jobs/ProcessCrawlJob.php @@ -84,8 +84,14 @@ private function updatePageStatus(FetchResult $result): void 'status' => $status, 'fetched_at' => now(), 'title' => $result->title, - 'language' => $result->language, - 'language_confidence' => $result->languageConfidence, + // Sticky language: only write when detection produced a value, so a re-crawl + // returning null doesn't erase a previously-detected language. Guarding on + // language alone is sufficient because FetchPageAction::detectLanguage() + // always returns the pair as both-null or both-non-null (never mixed). + ...($result->language !== null ? 
[ + 'language' => $result->language, + 'language_confidence' => $result->languageConfidence, + ] : []), ], PageStatusEnum::Failed => [ 'status' => $status, diff --git a/tests/Feature/Jobs/ProcessCrawlJobTest.php b/tests/Feature/Jobs/ProcessCrawlJobTest.php index bf353e6..4f07f80 100644 --- a/tests/Feature/Jobs/ProcessCrawlJobTest.php +++ b/tests/Feature/Jobs/ProcessCrawlJobTest.php @@ -482,7 +482,43 @@ public function test_handle_persists_language_on_success(): void $this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001); } - public function test_handle_persists_null_language_on_success(): void + public function test_handle_does_not_overwrite_existing_language_when_new_fetch_returns_null(): void + { + Queue::fake(); + + $this->mockFetchPageAction( + CrawlOutcomeEnum::Success, + statusCode: 200, + title: 'Hello', + extractedText: 'hi', + wordCount: 1, + language: null, + languageConfidence: null, + ); + + // Page already has a language from a previous fetch + $page = Page::factory()->createQuietly([ + 'url' => 'https://example.com/article', + 'language' => 'en', + 'language_confidence' => 0.95, + ]); + $crawl = PageCrawl::factory()->page($page)->createQuietly(); + + app(ProcessCrawlJob::class, ['pageCrawl' => $crawl]) + ->handle(); + + $fresh = $page->fresh(); + + // Language columns must be sticky — null detection must NOT overwrite them + $this->assertSame('en', $fresh->language); + $this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001); + + // Other columns must still update — sticky applies to language only + $this->assertSame(PageStatusEnum::Fetched, $fresh->status); + $this->assertSame('Hello', $fresh->title); + } + + public function test_handle_leaves_language_null_when_no_prior_and_no_detection(): void { Queue::fake(); From 9cecc47b8bce64f31c3b22d2f722a4ee2dcb8aa8 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Tue, 28 Apr 2026 18:47:04 +0200 Subject: [PATCH 56/65] 6 - Log structured success entry on PollFediverseAction with url 
count and duration --- .../src/Actions/PollFediverseAction.php | 48 ++++++++++++------- .../tests/Feature/PollFediverseActionTest.php | 24 ++++++++++ 2 files changed, 54 insertions(+), 18 deletions(-) diff --git a/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php b/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php index 11d8767..34cadb5 100644 --- a/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php +++ b/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php @@ -5,6 +5,7 @@ namespace Lvl0\FediDiscover\Actions; use Carbon\CarbonImmutable; +use Illuminate\Support\Collection; use Illuminate\Support\Facades\Log; use Lvl0\FediDiscover\Clients\FediverseClientFactory; use Lvl0\FediDiscover\Events\UrlDiscovered; @@ -18,22 +19,26 @@ public function __construct(private FediverseClientFactory $factory) {} public function execute(Instance $instance): void { + $start = microtime(true); + $client = $this->factory->for($instance); $posts = $client->fetchPostsSince($instance, $instance->last_seen_id); - $posts->each(function (FediversePost $post) use ($instance) { - try { - $this->processLinks($post, $instance); - } catch (Throwable $e) { - Log::warning('fedi-discover:processLinks failed', [ - 'instance_id' => $instance->id, - 'instance_url' => $instance->url, - 'post_url' => $post->selfUrl, - 'exception' => $e::class, - 'message' => $e->getMessage(), - ]); - } - }); + $urlCount = $posts + ->map(function (FediversePost $post) use ($instance) { + try { + return $this->processLinks($post, $instance); + } catch (Throwable $e) { + Log::warning('fedi-discover:processLinks failed', [ + 'instance_id' => $instance->id, + 'instance_url' => $instance->url, + 'post_url' => $post->selfUrl, + 'exception' => $e::class, + 'message' => $e->getMessage(), + ]); + } + }) + ->sum(); if ($posts->isNotEmpty()) { $instance->last_seen_id = $posts->first()->cursorId; @@ -41,21 +46,27 @@ public function execute(Instance $instance): void 
$instance->last_polled_at = now(); $instance->save(); + + Log::info('fedi-discover:poll succeeded', [ + 'instance_id' => $instance->id, + 'url_count' => $urlCount, + 'duration_ms' => (int) round((microtime(true) - $start) * 1000), + ]); } - private function processLinks(FediversePost $post, Instance $instance): void + private function processLinks(FediversePost $post, Instance $instance): int { if ($post->body === null) { - return; + return 0; } $linksFound = preg_match_all('~https?://[^\s<>"\'()\[\]]+~', $post->body, $matches); if ($linksFound === 0) { - return; + return 0; } - collect($matches[0]) + return collect($matches[0]) ->map(fn (string $u) => rtrim($u, '.,;:!?')) ->filter(fn (string $u) => filter_var($u, FILTER_VALIDATE_URL) !== false) ->filter(fn (string $u) => parse_url($u, PHP_URL_HOST) !== parse_url($instance->url, PHP_URL_HOST)) @@ -66,6 +77,7 @@ private function processLinks(FediversePost $post, Instance $instance): void discoveredAt: CarbonImmutable::now(), postUrl: $post->selfUrl, postBody: $post->body, - )); + )) + ->count(); } } diff --git a/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php b/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php index 231e16c..8524fb4 100644 --- a/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php +++ b/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php @@ -7,6 +7,7 @@ use Carbon\CarbonImmutable; use Illuminate\Foundation\Testing\RefreshDatabase; use Illuminate\Support\Facades\Event; +use Illuminate\Support\Facades\Log; use Lvl0\FediDiscover\Actions\PollFediverseAction; use Lvl0\FediDiscover\Clients\FediverseClientFactory; use Lvl0\FediDiscover\Clients\FediverseClientInterface; @@ -198,6 +199,29 @@ public function test_it_leaves_last_seen_id_unchanged_when_no_posts_are_returned $this->assertSame('500', $instance->fresh()->last_seen_id); } + public function test_poll_logs_a_structured_success_entry_with_url_count_and_duration(): void + { + 
Log::spy(); + Event::fake([UrlDiscovered::class]); + + $instance = $this->makeInstance(); + + $this->pollInstance($instance, [ + new FediversePost('1', 'https://mastodon.social/@alice/1', 'See https://example.com/one and https://other.example/two'), + new FediversePost('2', 'https://mastodon.social/@bob/2', 'Also https://example.com/three'), + ]); + + Log::shouldHaveReceived('info') + ->once() + ->withArgs(function (string $message, array $context) use ($instance): bool { + return $message === 'fedi-discover:poll succeeded' + && $context['instance_id'] === $instance->id + && $context['url_count'] === 3 + && isset($context['duration_ms']) + && $context['duration_ms'] >= 0; + }); + } + /** * @param array $posts */ From 6e097acf883eb3a4d589afa8a03394a0a9075b18 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Tue, 28 Apr 2026 18:48:20 +0200 Subject: [PATCH 57/65] 6 - Add admin instances page listing url and last poll --- .../Controllers/Admin/InstancesController.php | 18 ++++++ resources/views/admin/index.blade.php | 22 ++++++++ resources/views/layouts/layout.blade.php | 8 +++ routes/web.php | 3 + .../Feature/Admin/InstancesAdminPageTest.php | 56 +++++++++++++++++++ 5 files changed, 107 insertions(+) create mode 100644 app/Http/Controllers/Admin/InstancesController.php create mode 100644 resources/views/admin/index.blade.php create mode 100644 resources/views/layouts/layout.blade.php create mode 100644 tests/Feature/Admin/InstancesAdminPageTest.php diff --git a/app/Http/Controllers/Admin/InstancesController.php b/app/Http/Controllers/Admin/InstancesController.php new file mode 100644 index 0000000..4cbdf88 --- /dev/null +++ b/app/Http/Controllers/Admin/InstancesController.php @@ -0,0 +1,18 @@ +get(); + + return view('admin.index', ['instances' => $instances]); + } +} diff --git a/resources/views/admin/index.blade.php b/resources/views/admin/index.blade.php new file mode 100644 index 0000000..05d00bf --- /dev/null +++ b/resources/views/admin/index.blade.php @@ -0,0 +1,22 
@@ + +
+

Instances

+ + + + + + + + + + @foreach($instances as $instance) + + + + + @endforeach + +
InstanceLast polled at
{{ $instance->url }}{{ $instance->last_polled_at }}
+
+
diff --git a/resources/views/layouts/layout.blade.php b/resources/views/layouts/layout.blade.php new file mode 100644 index 0000000..5179da6 --- /dev/null +++ b/resources/views/layouts/layout.blade.php @@ -0,0 +1,8 @@ + + + {{ $title ?? 'Trove' }} + + + {{ $slot }} + + diff --git a/routes/web.php b/routes/web.php index 6199875..6b7b768 100644 --- a/routes/web.php +++ b/routes/web.php @@ -2,6 +2,7 @@ declare(strict_types=1); +use App\Http\Controllers\Admin\InstancesController; use Illuminate\Support\Facades\Route; Route::get('/', function () { @@ -11,3 +12,5 @@ Route::view('/submit', 'urls.submit'); Route::view('/bot', 'bot'); + +Route::get('/admin/instances', [InstancesController::class, 'index'])->name('admin.instances'); diff --git a/tests/Feature/Admin/InstancesAdminPageTest.php b/tests/Feature/Admin/InstancesAdminPageTest.php new file mode 100644 index 0000000..88c7670 --- /dev/null +++ b/tests/Feature/Admin/InstancesAdminPageTest.php @@ -0,0 +1,56 @@ +get('/admin/instances'); + + $response->assertStatus(200); + } + + // ------------------------------------------------------------------------- + // Test 4 — admin instances page lists each instance's URL and last_polled_at + // ------------------------------------------------------------------------- + + public function test_admin_instances_page_shows_each_instance_url_and_last_polled_at(): void + { + $mastodon = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create([ + 'url' => 'https://mastodon.social', + 'last_polled_at' => '2024-06-01 12:00:00', + ]); + + $lemmy = Instance::factory() + ->type(InstanceType::Lemmy) + ->enabled() + ->create([ + 'url' => 'https://lemmy.world', + 'last_polled_at' => '2024-06-01 13:00:00', + ]); + + $response = $this->get('/admin/instances'); + + $response->assertSee($mastodon->url); + $response->assertSee($lemmy->url); + $response->assertSee($mastodon->last_polled_at->toDateString()); + $response->assertSee($lemmy->last_polled_at->toDateString()); + } +} 
From 257dbfcf5f5c82b5e635d5709d7b9c89d4bdf75e Mon Sep 17 00:00:00 2001 From: myrmidex Date: Tue, 28 Apr 2026 19:51:42 +0200 Subject: [PATCH 58/65] chore - remove example tests --- tests/Feature/ExampleTest.php | 19 ------------------- tests/Unit/ExampleTest.php | 16 ---------------- 2 files changed, 35 deletions(-) delete mode 100644 tests/Feature/ExampleTest.php delete mode 100644 tests/Unit/ExampleTest.php diff --git a/tests/Feature/ExampleTest.php b/tests/Feature/ExampleTest.php deleted file mode 100644 index 8364a84..0000000 --- a/tests/Feature/ExampleTest.php +++ /dev/null @@ -1,19 +0,0 @@ -get('/'); - - $response->assertStatus(200); - } -} diff --git a/tests/Unit/ExampleTest.php b/tests/Unit/ExampleTest.php deleted file mode 100644 index 5773b0c..0000000 --- a/tests/Unit/ExampleTest.php +++ /dev/null @@ -1,16 +0,0 @@ -assertTrue(true); - } -} From f9cebe5bae149b98878257223222ae7ed69b3967 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Tue, 28 Apr 2026 23:33:32 +0200 Subject: [PATCH 59/65] 6 - Add admin instances page and PollAlertService failure tracking --- .../Controllers/Admin/InstancesController.php | 6 +- app/Services/PollAlertService.php | 14 +++ ...7_create_fedi_discover_instances_table.php | 1 + .../Lvl0/FediDiscover/src/Models/Instance.php | 11 ++- resources/views/admin/index.blade.php | 4 + .../Feature/Admin/InstancesAdminPageTest.php | 92 +++++++++++++++++-- .../Feature/Services/PollAlertServiceTest.php | 32 +++++++ 7 files changed, 150 insertions(+), 10 deletions(-) create mode 100644 app/Services/PollAlertService.php create mode 100644 tests/Feature/Services/PollAlertServiceTest.php diff --git a/app/Http/Controllers/Admin/InstancesController.php b/app/Http/Controllers/Admin/InstancesController.php index 4cbdf88..d47702d 100644 --- a/app/Http/Controllers/Admin/InstancesController.php +++ b/app/Http/Controllers/Admin/InstancesController.php @@ -3,6 +3,7 @@ namespace App\Http\Controllers\Admin; +use App\Enums\PageStatusEnum; use 
App\Http\Controllers\Controller; use Illuminate\View\View; use Lvl0\FediDiscover\Models\Instance; @@ -11,7 +12,10 @@ class InstancesController extends Controller { public function index(): View { - $instances = Instance::orderBy('url', 'asc')->get(); + $instances = Instance::withCount([ + 'pages', + 'pages as failed_pages_count' => fn ($q) => $q->where('status', PageStatusEnum::Failed), + ])->orderBy('url', 'asc')->get(); return view('admin.index', ['instances' => $instances]); } diff --git a/app/Services/PollAlertService.php b/app/Services/PollAlertService.php new file mode 100644 index 0000000..44b470b --- /dev/null +++ b/app/Services/PollAlertService.php @@ -0,0 +1,14 @@ +increment('consecutive_poll_failures'); + } +} diff --git a/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php b/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php index 5a9fb60..209c3f7 100644 --- a/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php +++ b/packages/Lvl0/FediDiscover/database/migrations/2026_04_23_205027_create_fedi_discover_instances_table.php @@ -18,6 +18,7 @@ public function up(): void $table->boolean('enabled')->default(true); $table->unsignedInteger('interval_seconds')->default(300); $table->json('extras')->default('{}'); + $table->unsignedInteger('consecutive_poll_failures')->default(0); $table->timestampTz('last_polled_at')->nullable(); $table->string('last_seen_id')->nullable(); $table->timestamps(); diff --git a/packages/Lvl0/FediDiscover/src/Models/Instance.php b/packages/Lvl0/FediDiscover/src/Models/Instance.php index a7211e5..f2c6b87 100644 --- a/packages/Lvl0/FediDiscover/src/Models/Instance.php +++ b/packages/Lvl0/FediDiscover/src/Models/Instance.php @@ -4,10 +4,12 @@ namespace Lvl0\FediDiscover\Models; +use App\Models\Page; use Illuminate\Database\Eloquent\Builder; use 
Illuminate\Database\Eloquent\Factories\Factory; use Illuminate\Database\Eloquent\Factories\HasFactory; use Illuminate\Database\Eloquent\Model; +use Illuminate\Database\Eloquent\Relations\HasMany; use Illuminate\Support\Carbon; use Lvl0\FediDiscover\Config\InstanceType; use Lvl0\FediDiscover\Database\Factories\InstanceFactory; @@ -20,6 +22,7 @@ * @property int $interval_seconds * @property array $extras * @property string|null $last_seen_id + * @property int $consecutive_poll_failures * @property Carbon|null $last_polled_at * @property Carbon $created_at * @property Carbon $updated_at @@ -31,7 +34,7 @@ class Instance extends Model protected $table = 'fedi_discover_instances'; - protected $fillable = ['type', 'url', 'enabled', 'interval_seconds', 'extras', 'last_seen_id', 'last_polled_at']; + protected $fillable = ['type', 'url', 'enabled', 'interval_seconds', 'extras', 'last_seen_id', 'last_polled_at', 'consecutive_poll_failures']; protected $casts = [ 'type' => InstanceType::class, @@ -53,4 +56,10 @@ protected static function newFactory(): Factory { return InstanceFactory::new(); } + + public function pages(): HasMany + { + return $this->hasMany(Page::class); + } + } diff --git a/resources/views/admin/index.blade.php b/resources/views/admin/index.blade.php index 05d00bf..721bca7 100644 --- a/resources/views/admin/index.blade.php +++ b/resources/views/admin/index.blade.php @@ -7,6 +7,8 @@ Instance Last polled at + URLs + Errors @@ -14,6 +16,8 @@ {{ $instance->url }} {{ $instance->last_polled_at }} + {{ $instance->pages_count }} URLs + {{ $instance->failed_pages_count }} errors @endforeach diff --git a/tests/Feature/Admin/InstancesAdminPageTest.php b/tests/Feature/Admin/InstancesAdminPageTest.php index 88c7670..46a0070 100644 --- a/tests/Feature/Admin/InstancesAdminPageTest.php +++ b/tests/Feature/Admin/InstancesAdminPageTest.php @@ -4,6 +4,7 @@ namespace Tests\Feature\Admin; +use App\Models\Page; use Illuminate\Foundation\Testing\RefreshDatabase; use 
Lvl0\FediDiscover\Config\InstanceType; use Lvl0\FediDiscover\Models\Instance; @@ -13,10 +14,6 @@ class InstancesAdminPageTest extends TestCase { use RefreshDatabase; - // ------------------------------------------------------------------------- - // Test 3 — admin instances page is accessible - // ------------------------------------------------------------------------- - public function test_admin_instances_page_is_accessible(): void { $response = $this->get('/admin/instances'); @@ -24,10 +21,6 @@ public function test_admin_instances_page_is_accessible(): void $response->assertStatus(200); } - // ------------------------------------------------------------------------- - // Test 4 — admin instances page lists each instance's URL and last_polled_at - // ------------------------------------------------------------------------- - public function test_admin_instances_page_shows_each_instance_url_and_last_polled_at(): void { $mastodon = Instance::factory() @@ -53,4 +46,87 @@ public function test_admin_instances_page_shows_each_instance_url_and_last_polle $response->assertSee($mastodon->last_polled_at->toDateString()); $response->assertSee($lemmy->last_polled_at->toDateString()); } + + public function test_admin_instances_page_shows_error_count_per_instance(): void + { + $first = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(['url' => 'https://aardvark.example']); + + $second = Instance::factory() + ->type(InstanceType::Lemmy) + ->enabled() + ->create(['url' => 'https://zebra.example']); + + // First instance: 3 failed + 2 non-failed pages + Page::factory() + ->count(3) + ->sequence(fn ($s) => ['url' => "https://aardvark.example/fail-{$s->index}"]) + ->createQuietly(['instance_id' => $first->id, 'status' => \App\Enums\PageStatusEnum::Failed]); + + Page::factory() + ->count(2) + ->sequence(fn ($s) => ['url' => "https://aardvark.example/ok-{$s->index}"]) + ->createQuietly(['instance_id' => $first->id, 'status' => 
\App\Enums\PageStatusEnum::Fetched]); + + // Second instance: 1 failed + 4 non-failed pages + Page::factory() + ->count(1) + ->sequence(fn ($s) => ['url' => "https://zebra.example/fail-{$s->index}"]) + ->createQuietly(['instance_id' => $second->id, 'status' => \App\Enums\PageStatusEnum::Failed]); + + Page::factory() + ->count(4) + ->sequence(fn ($s) => ['url' => "https://zebra.example/ok-{$s->index}"]) + ->createQuietly(['instance_id' => $second->id, 'status' => \App\Enums\PageStatusEnum::Fetched]); + + $response = $this->get('/admin/instances'); + + // Each error-count cell must render as "{n} errors" — this string cannot + // collide with dates, IDs, or the "URLs" column. The counts (3 and 1) + // are distinct and non-equal so the assertion proves per-row mapping, + // not a leaked total. + $response->assertSeeInOrder([ + $first->url, + '3 errors', + $second->url, + '1 errors', + ]); + } + + public function test_admin_instances_page_shows_url_count_per_instance(): void + { + $first = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(['url' => 'https://aardvark.example']); + + $second = Instance::factory() + ->type(InstanceType::Lemmy) + ->enabled() + ->create(['url' => 'https://zebra.example']); + + Page::factory() + ->count(7) + ->sequence(fn ($s) => ['url' => "https://aardvark.example/page-{$s->index}"]) + ->createQuietly(['instance_id' => $first->id]); + + Page::factory() + ->count(2) + ->sequence(fn ($s) => ['url' => "https://zebra.example/page-{$s->index}"]) + ->createQuietly(['instance_id' => $second->id]); + + $response = $this->get('/admin/instances'); + + // Each count cell must render as "{n} URLs" — this string cannot + // collide with dates, IDs, or any other incidental numeric content, + // so the assertion only passes when a real count column is wired in. 
+ $response->assertSeeInOrder([ + $first->url, + '7 URLs', + $second->url, + '2 URLs', + ]); + } } diff --git a/tests/Feature/Services/PollAlertServiceTest.php b/tests/Feature/Services/PollAlertServiceTest.php new file mode 100644 index 0000000..a1e7a5e --- /dev/null +++ b/tests/Feature/Services/PollAlertServiceTest.php @@ -0,0 +1,32 @@ +type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 0]); + + $service = new PollAlertService(); + $service->recordFailure($instance); + + $this->assertDatabaseHas('fedi_discover_instances', [ + 'id' => $instance->id, + 'consecutive_poll_failures' => 1, + ]); + } +} From 920985eec86251a0d42b17d12fe8862649f8c84e Mon Sep 17 00:00:00 2001 From: myrmidex Date: Wed, 29 Apr 2026 21:07:43 +0200 Subject: [PATCH 60/65] 6 - Convert blade views to layouts/app inheritance --- resources/views/admin/index.blade.php | 7 +- resources/views/bot.blade.php | 7 +- .../app.blade.php} | 4 +- resources/views/layouts/layout.blade.php | 8 - resources/views/urls/submit.blade.php | 8 +- resources/views/welcome.blade.php | 228 +----------------- 6 files changed, 22 insertions(+), 240 deletions(-) rename resources/views/{components/layout.blade.php => layouts/app.blade.php} (82%) delete mode 100644 resources/views/layouts/layout.blade.php diff --git a/resources/views/admin/index.blade.php b/resources/views/admin/index.blade.php index 721bca7..033c409 100644 --- a/resources/views/admin/index.blade.php +++ b/resources/views/admin/index.blade.php @@ -1,4 +1,6 @@ - +@extends('layouts.app') + +@section('content')

Instances

@@ -23,4 +25,5 @@
-
+@endsection + diff --git a/resources/views/bot.blade.php b/resources/views/bot.blade.php index 4859039..ad4ef5d 100644 --- a/resources/views/bot.blade.php +++ b/resources/views/bot.blade.php @@ -1,4 +1,7 @@ - +@extends('layouts.app') + +@section('content') +

About TroveBot

@@ -57,4 +60,4 @@
-
+@endsection diff --git a/resources/views/components/layout.blade.php b/resources/views/layouts/app.blade.php similarity index 82% rename from resources/views/components/layout.blade.php rename to resources/views/layouts/app.blade.php index 5100c31..8e6f01a 100644 --- a/resources/views/components/layout.blade.php +++ b/resources/views/layouts/app.blade.php @@ -4,14 +4,14 @@ - {{ $title ?? config('app.name') }} + Trove @yield('title') @vite(['resources/css/app.css', 'resources/js/app.js']) @livewireStyles - {{ $slot }} + @yield('content') @livewireScripts diff --git a/resources/views/layouts/layout.blade.php b/resources/views/layouts/layout.blade.php deleted file mode 100644 index 5179da6..0000000 --- a/resources/views/layouts/layout.blade.php +++ /dev/null @@ -1,8 +0,0 @@ - - - {{ $title ?? 'Trove' }} - - - {{ $slot }} - - diff --git a/resources/views/urls/submit.blade.php b/resources/views/urls/submit.blade.php index 266ab36..1385d93 100644 --- a/resources/views/urls/submit.blade.php +++ b/resources/views/urls/submit.blade.php @@ -1,3 +1,7 @@ - +@extends('layouts.app') + +@section('content') + - + +@endsection diff --git a/resources/views/welcome.blade.php b/resources/views/welcome.blade.php index 2c2e7c2..9c0de81 100644 --- a/resources/views/welcome.blade.php +++ b/resources/views/welcome.blade.php @@ -1,225 +1,5 @@ - - - - - +@extends('layouts.app') - {{ config('app.name', 'Laravel') }} - - - - - - - @if (file_exists(public_path('build/manifest.json')) || file_exists(public_path('hot'))) - @vite(['resources/css/app.css', 'resources/js/app.js']) - @else - - @endif - - -
- @if (Route::has('login')) - - @endif -
-
-
-
-

Let's get started

-

With so many options available to you,
we suggest you start with the following:

- - - -

- v{{ app()->version() }} - - View changelog - - - - -

-
-
- {{-- Laravel Logo --}} - - - - - - - - - - - {{-- 13 --}} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-
-
- - @if (Route::has('login')) - - @endif - - +@section('content') + Welcome +@endsection From bbd74c1954d4d531eaf3cf7b090a4f8ce39e387c Mon Sep 17 00:00:00 2001 From: myrmidex Date: Wed, 29 Apr 2026 21:09:44 +0200 Subject: [PATCH 61/65] 6 - Reset consecutive_poll_failures on successful poll --- .../FediDiscover/src/Actions/PollFediverseAction.php | 2 +- .../tests/Feature/PollFediverseActionTest.php | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php b/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php index 34cadb5..d55da41 100644 --- a/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php +++ b/packages/Lvl0/FediDiscover/src/Actions/PollFediverseAction.php @@ -5,7 +5,6 @@ namespace Lvl0\FediDiscover\Actions; use Carbon\CarbonImmutable; -use Illuminate\Support\Collection; use Illuminate\Support\Facades\Log; use Lvl0\FediDiscover\Clients\FediverseClientFactory; use Lvl0\FediDiscover\Events\UrlDiscovered; @@ -44,6 +43,7 @@ public function execute(Instance $instance): void $instance->last_seen_id = $posts->first()->cursorId; } + $instance->consecutive_poll_failures = 0; $instance->last_polled_at = now(); $instance->save(); diff --git a/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php b/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php index 8524fb4..0056d44 100644 --- a/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php +++ b/packages/Lvl0/FediDiscover/tests/Feature/PollFediverseActionTest.php @@ -199,6 +199,15 @@ public function test_it_leaves_last_seen_id_unchanged_when_no_posts_are_returned $this->assertSame('500', $instance->fresh()->last_seen_id); } + public function test_consecutive_poll_failures_reset_to_zero_after_successful_poll(): void + { + $instance = $this->makeInstance(['consecutive_poll_failures' => 5]); + + $this->pollInstance($instance, []); + + $this->assertSame(0, 
$instance->fresh()->consecutive_poll_failures); + } + public function test_poll_logs_a_structured_success_entry_with_url_count_and_duration(): void { Log::spy(); From 8d063a8262ce11a025d358b3273f0f727b03ed0c Mon Sep 17 00:00:00 2001 From: myrmidex Date: Wed, 29 Apr 2026 21:15:11 +0200 Subject: [PATCH 62/65] 6 - Add PollFailed event to FediDiscover package --- .../FediDiscover/src/Events/PollFailed.php | 23 ++++++++++++++ .../tests/Unit/PollFailedTest.php | 31 +++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 packages/Lvl0/FediDiscover/src/Events/PollFailed.php create mode 100644 packages/Lvl0/FediDiscover/tests/Unit/PollFailedTest.php diff --git a/packages/Lvl0/FediDiscover/src/Events/PollFailed.php b/packages/Lvl0/FediDiscover/src/Events/PollFailed.php new file mode 100644 index 0000000..56c7b55 --- /dev/null +++ b/packages/Lvl0/FediDiscover/src/Events/PollFailed.php @@ -0,0 +1,23 @@ +id = 7; + + $failedAt = CarbonImmutable::parse('2026-04-28T09:00:00'); + + $event = new PollFailed( + instance: $instance, + message: 'Connection timed out', + failedAt: $failedAt, + ); + + $this->assertSame($instance, $event->instance); + $this->assertSame('Connection timed out', $event->message); + $this->assertTrue($failedAt->eq($event->failedAt)); + } +} From 6ab175a466b928eab894787c95f423c20944c192 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Wed, 29 Apr 2026 21:25:11 +0200 Subject: [PATCH 63/65] 6 - Send ntfy alert when poll failures cross threshold --- .env.example | 4 + .../Controllers/Admin/InstancesController.php | 1 + app/Services/PollAlertService.php | 26 +++- config/services.php | 6 + .../Lvl0/FediDiscover/src/Models/Instance.php | 1 - .../Feature/Admin/InstancesAdminPageTest.php | 9 +- .../Feature/Services/PollAlertServiceTest.php | 145 +++++++++++++++++- 7 files changed, 183 insertions(+), 9 deletions(-) diff --git a/.env.example b/.env.example index f83cd0b..ac89b76 100644 --- a/.env.example +++ b/.env.example @@ -63,3 +63,7 @@ 
AWS_USE_PATH_STYLE_ENDPOINT=false VITE_APP_NAME="${APP_NAME}" CRAWLER_MIN_DOMAIN_DELAY_SECONDS=10 + +NTFY_URL= +NTFY_TOPIC= +NTFY_THRESHOLD= diff --git a/app/Http/Controllers/Admin/InstancesController.php b/app/Http/Controllers/Admin/InstancesController.php index d47702d..266fe7b 100644 --- a/app/Http/Controllers/Admin/InstancesController.php +++ b/app/Http/Controllers/Admin/InstancesController.php @@ -1,4 +1,5 @@ increment('consecutive_poll_failures'); + $instance->refresh(); + + $ntfyUrl = config('services.ntfy.url'); + $ntfyThreshold = (int) config('services.ntfy.threshold'); + $ntfyTopic = config('services.ntfy.topic'); + + if ($ntfyUrl === null || $ntfyThreshold === 0 || $ntfyTopic === null) { + return; + } + + if ($instance->consecutive_poll_failures < $ntfyThreshold) { + return; + } + + try { + Http::timeout(5) + ->withBody($instance->url . ' - ' . $message, 'text/plain') + ->post(rtrim($ntfyUrl, '/') . '/' . $ntfyTopic); + } catch (Exception $e) { + logger()->warning('ntfy alert failed', ['instance' => $instance->url, 'error' => $e->getMessage()]); + } } } diff --git a/config/services.php b/config/services.php index 6a90eb8..93fd034 100644 --- a/config/services.php +++ b/config/services.php @@ -14,6 +14,12 @@ | */ + 'ntfy' => [ + 'url' => env('NTFY_URL') ?: null, + 'topic' => env('NTFY_TOPIC') ?: null, + 'threshold' => env('NTFY_THRESHOLD'), + ], + 'postmark' => [ 'key' => env('POSTMARK_API_KEY'), ], diff --git a/packages/Lvl0/FediDiscover/src/Models/Instance.php b/packages/Lvl0/FediDiscover/src/Models/Instance.php index f2c6b87..9d61119 100644 --- a/packages/Lvl0/FediDiscover/src/Models/Instance.php +++ b/packages/Lvl0/FediDiscover/src/Models/Instance.php @@ -61,5 +61,4 @@ public function pages(): HasMany { return $this->hasMany(Page::class); } - } diff --git a/tests/Feature/Admin/InstancesAdminPageTest.php b/tests/Feature/Admin/InstancesAdminPageTest.php index 46a0070..fb633d7 100644 --- a/tests/Feature/Admin/InstancesAdminPageTest.php +++ 
b/tests/Feature/Admin/InstancesAdminPageTest.php @@ -4,6 +4,7 @@ namespace Tests\Feature\Admin; +use App\Enums\PageStatusEnum; use App\Models\Page; use Illuminate\Foundation\Testing\RefreshDatabase; use Lvl0\FediDiscover\Config\InstanceType; @@ -63,23 +64,23 @@ public function test_admin_instances_page_shows_error_count_per_instance(): void Page::factory() ->count(3) ->sequence(fn ($s) => ['url' => "https://aardvark.example/fail-{$s->index}"]) - ->createQuietly(['instance_id' => $first->id, 'status' => \App\Enums\PageStatusEnum::Failed]); + ->createQuietly(['instance_id' => $first->id, 'status' => PageStatusEnum::Failed]); Page::factory() ->count(2) ->sequence(fn ($s) => ['url' => "https://aardvark.example/ok-{$s->index}"]) - ->createQuietly(['instance_id' => $first->id, 'status' => \App\Enums\PageStatusEnum::Fetched]); + ->createQuietly(['instance_id' => $first->id, 'status' => PageStatusEnum::Fetched]); // Second instance: 1 failed + 4 non-failed pages Page::factory() ->count(1) ->sequence(fn ($s) => ['url' => "https://zebra.example/fail-{$s->index}"]) - ->createQuietly(['instance_id' => $second->id, 'status' => \App\Enums\PageStatusEnum::Failed]); + ->createQuietly(['instance_id' => $second->id, 'status' => PageStatusEnum::Failed]); Page::factory() ->count(4) ->sequence(fn ($s) => ['url' => "https://zebra.example/ok-{$s->index}"]) - ->createQuietly(['instance_id' => $second->id, 'status' => \App\Enums\PageStatusEnum::Fetched]); + ->createQuietly(['instance_id' => $second->id, 'status' => PageStatusEnum::Fetched]); $response = $this->get('/admin/instances'); diff --git a/tests/Feature/Services/PollAlertServiceTest.php b/tests/Feature/Services/PollAlertServiceTest.php index a1e7a5e..714f359 100644 --- a/tests/Feature/Services/PollAlertServiceTest.php +++ b/tests/Feature/Services/PollAlertServiceTest.php @@ -6,6 +6,7 @@ use App\Services\PollAlertService; use Illuminate\Foundation\Testing\RefreshDatabase; +use Illuminate\Support\Facades\Http; use 
Lvl0\FediDiscover\Config\InstanceType; use Lvl0\FediDiscover\Models\Instance; use Tests\TestCase; @@ -14,19 +15,157 @@ class PollAlertServiceTest extends TestCase { use RefreshDatabase; - public function test_recordFailure_increments_consecutive_poll_failures_on_the_instance(): void + public function test_record_failure_increments_consecutive_poll_failures_on_the_instance(): void { $instance = Instance::factory() ->type(InstanceType::Mastodon) ->enabled() ->create(['consecutive_poll_failures' => 0]); - $service = new PollAlertService(); - $service->recordFailure($instance); + $service = new PollAlertService; + $service->recordFailure($instance, 'test'); $this->assertDatabaseHas('fedi_discover_instances', [ 'id' => $instance->id, 'consecutive_poll_failures' => 1, ]); } + + public function test_no_alert_sent_below_threshold(): void + { + Http::fake(); + + config([ + 'services.ntfy.url' => 'https://ntfy.example.com', + 'services.ntfy.topic' => 'trove-alerts', + 'services.ntfy.threshold' => 3, + ]); + + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 1]); // will become 2 after recordFailure + + $service = new PollAlertService; + $service->recordFailure($instance, 'test'); + + Http::assertNothingSent(); + } + + public function test_alert_sent_when_threshold_is_reached(): void + { + Http::fake(); + + config([ + 'services.ntfy.url' => 'https://ntfy.example.com', + 'services.ntfy.topic' => 'trove-alerts', + 'services.ntfy.threshold' => 3, + ]); + + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 2]); // will become 3 after recordFailure = exactly at threshold + + $service = new PollAlertService; + $service->recordFailure($instance, 'test'); + + Http::assertSent(function ($request) { + return $request->url() === 'https://ntfy.example.com/trove-alerts' + && $request->method() === 'POST'; + }); + } + + public function 
test_alert_sent_when_count_exceeds_threshold(): void + { + Http::fake(); + + config([ + 'services.ntfy.url' => 'https://ntfy.example.com', + 'services.ntfy.topic' => 'trove-alerts', + 'services.ntfy.threshold' => 3, + ]); + + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 3]); // will become 4 after recordFailure = above threshold + + $service = new PollAlertService; + $service->recordFailure($instance, 'test'); + + Http::assertSent(function ($request) { + return $request->url() === 'https://ntfy.example.com/trove-alerts' + && $request->method() === 'POST'; + }); + } + + public function test_no_alert_sent_when_threshold_is_zero(): void + { + Http::fake(); + + config([ + 'services.ntfy.url' => 'https://ntfy.example.com', + 'services.ntfy.topic' => 'trove-alerts', + 'services.ntfy.threshold' => 0, + ]); + + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 5]); + + $service = new PollAlertService; + $service->recordFailure($instance, 'test'); + + Http::assertNothingSent(); + } + + public function test_no_alert_sent_when_topic_is_null(): void + { + Http::fake(); + + config([ + 'services.ntfy.url' => 'https://ntfy.example.com', + 'services.ntfy.topic' => null, + 'services.ntfy.threshold' => 3, + ]); + + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 2]); // will become 3 after recordFailure = at threshold + + $service = new PollAlertService; + $service->recordFailure($instance, 'test'); + + Http::assertNothingSent(); + } + + public function test_alert_body_contains_instance_url_and_message(): void + { + Http::fake(); + + config([ + 'services.ntfy.url' => 'https://ntfy.example.com', + 'services.ntfy.topic' => 'trove-alerts', + 'services.ntfy.threshold' => 3, + ]); + + $instance = Instance::factory() + ->type(InstanceType::Mastodon) + ->enabled() + 
->create([ + 'url' => 'https://mastodon.social', + 'consecutive_poll_failures' => 2, // will become 3 = at threshold + ]); + + $service = new PollAlertService; + $service->recordFailure($instance, 'connection refused after 3 retries'); + + Http::assertSent(function ($request) { + return str_contains($request->body(), 'https://mastodon.social') + && str_contains($request->body(), 'connection refused after 3 retries'); + }); + } } From 31a53de9fbc4412178a3784c57f5470555b8ca0d Mon Sep 17 00:00:00 2001 From: myrmidex Date: Wed, 29 Apr 2026 23:21:02 +0200 Subject: [PATCH 64/65] 6 - Wire PollFailed dispatch and listener --- app/Listeners/PollFailedListener.php | 18 +++++++ app/Providers/AppServiceProvider.php | 3 ++ .../Console/Commands/PollInstancesCommand.php | 29 +++++++---- .../Feature/PollInstancesCommandTest.php | 48 +++++++++++++++++ .../Listeners/PollFailedListenerTest.php | 52 +++++++++++++++++++ tests/Feature/PollFailedIntegrationTest.php | 37 +++++++++++++ 6 files changed, 177 insertions(+), 10 deletions(-) create mode 100644 app/Listeners/PollFailedListener.php create mode 100644 tests/Feature/Listeners/PollFailedListenerTest.php create mode 100644 tests/Feature/PollFailedIntegrationTest.php diff --git a/app/Listeners/PollFailedListener.php b/app/Listeners/PollFailedListener.php new file mode 100644 index 0000000..e501ff7 --- /dev/null +++ b/app/Listeners/PollFailedListener.php @@ -0,0 +1,18 @@ +service->recordFailure($event->instance, $event->message); + } +} diff --git a/app/Providers/AppServiceProvider.php b/app/Providers/AppServiceProvider.php index 30eaf8a..dfb03cd 100644 --- a/app/Providers/AppServiceProvider.php +++ b/app/Providers/AppServiceProvider.php @@ -2,10 +2,12 @@ namespace App\Providers; +use App\Listeners\PollFailedListener; use App\Listeners\UrlDiscoveredListener; use App\Services\LanguageDetectionService; use Illuminate\Support\Facades\Event; use Illuminate\Support\ServiceProvider; +use Lvl0\FediDiscover\Events\PollFailed; use 
Lvl0\FediDiscover\Events\UrlDiscovered; class AppServiceProvider extends ServiceProvider @@ -18,5 +20,6 @@ public function register(): void public function boot(): void { Event::listen(UrlDiscovered::class, UrlDiscoveredListener::class); + Event::listen(PollFailed::class, PollFailedListener::class); } } diff --git a/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php b/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php index 41b9604..e92c496 100644 --- a/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php +++ b/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php @@ -9,6 +9,7 @@ use Illuminate\Console\Command; use Illuminate\Support\Facades\Log; use Lvl0\FediDiscover\Actions\PollFediverseAction; +use Lvl0\FediDiscover\Events\PollFailed; use Lvl0\FediDiscover\Models\Instance; use Throwable; @@ -24,13 +25,13 @@ public function __construct( public function handle(): int { - $hadFailure = false; - - Instance::enabled() + $errors = Instance::enabled() ->get() - ->each(function (Instance $instance) use (&$hadFailure) { + ->map(function (Instance $instance) { try { $this->action->execute($instance); + + return ['instance_id' => $instance->id, 'status' => 'success']; } catch (Throwable $e) { $this->error("Failed to poll {$instance->url}: {$e->getMessage()}"); Log::warning('fedi-discover:poll failed', [ @@ -39,14 +40,22 @@ public function handle(): int 'exception' => $e::class, 'message' => $e->getMessage(), ]); - $hadFailure = true; - } - }); - if ($hadFailure) { - return self::FAILURE; + return ['instance' => $instance, 'status' => 'error', 'error' => $e->getMessage()]; + } + }) + ->filter(fn (array $res) => $res['status'] === 'error'); + + if ($errors->isEmpty()) { + return self::SUCCESS; } - return self::SUCCESS; + $errors->each(fn (array $errorArr) => PollFailed::dispatch( + $errorArr['instance'], + $errorArr['error'] ?? 
'', + now()->toImmutable(), + )); + + return self::FAILURE; } } diff --git a/packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php b/packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php index f1797c7..a449552 100644 --- a/packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php +++ b/packages/Lvl0/FediDiscover/tests/Feature/PollInstancesCommandTest.php @@ -5,10 +5,12 @@ namespace Lvl0\FediDiscover\Tests\Feature; use Illuminate\Foundation\Testing\RefreshDatabase; +use Illuminate\Support\Facades\Event; use Lvl0\FediDiscover\Actions\PollFediverseAction; use Lvl0\FediDiscover\Clients\FediverseClientFactory; use Lvl0\FediDiscover\Clients\FediverseClientInterface; use Lvl0\FediDiscover\Config\InstanceType; +use Lvl0\FediDiscover\Events\PollFailed; use Lvl0\FediDiscover\Models\Instance; use Mockery; use RuntimeException; @@ -122,6 +124,52 @@ public function test_one_instance_throwing_does_not_stop_remaining_instances_fro ); } + public function test_poll_failed_event_is_dispatched_when_action_throws(): void + { + Event::fake([PollFailed::class]); + + $instance = Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 'https://failing.example', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + $action = Mockery::mock(PollFediverseAction::class); + $action->shouldReceive('execute') + ->once() + ->andReturnUsing(function (): void { + throw new RuntimeException('Connection refused'); + }); + + $this->app->instance(PollFediverseAction::class, $action); + + $this->artisan('fedi-discover:poll'); + + Event::assertDispatched(PollFailed::class, function (PollFailed $event) use ($instance): bool { + return $event->instance->id === $instance->id + && $event->message === 'Connection refused'; + }); + } + + public function test_poll_failed_event_is_not_dispatched_on_a_successful_poll(): void + { + Event::fake([PollFailed::class]); + + Instance::create([ + 'type' => InstanceType::Mastodon, + 'url' => 
'https://healthy.example', + 'enabled' => true, + 'interval_seconds' => 600, + ]); + + // setUp() already binds a no-op action stub via the factory; no override needed. + + $this->artisan('fedi-discover:poll'); + + Event::assertNotDispatched(PollFailed::class); + } + public function test_it_exits_one_when_at_least_one_instance_fails(): void { Instance::create([ diff --git a/tests/Feature/Listeners/PollFailedListenerTest.php b/tests/Feature/Listeners/PollFailedListenerTest.php new file mode 100644 index 0000000..abd706b --- /dev/null +++ b/tests/Feature/Listeners/PollFailedListenerTest.php @@ -0,0 +1,52 @@ +type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 0]); + + $message = 'connection timed out'; + $failedAt = CarbonImmutable::now(); + $event = new PollFailed($instance, $message, $failedAt); + + $service = Mockery::mock(PollAlertService::class); + $service->shouldReceive('recordFailure') + ->once() + ->with( + Mockery::on(fn (Instance $i) => $i->is($instance)), + $message, + ); + + $listener = new PollFailedListener($service); + $listener->handle($event); + } + + public function test_listener_is_not_queued(): void + { + $this->assertNotInstanceOf( + ShouldQueue::class, + new PollFailedListener($this->createStub(PollAlertService::class)), + ); + } +} diff --git a/tests/Feature/PollFailedIntegrationTest.php b/tests/Feature/PollFailedIntegrationTest.php new file mode 100644 index 0000000..96dd973 --- /dev/null +++ b/tests/Feature/PollFailedIntegrationTest.php @@ -0,0 +1,37 @@ +type(InstanceType::Mastodon) + ->enabled() + ->create(['consecutive_poll_failures' => 0]); + + $this->mock(PollFediverseAction::class) + ->shouldReceive('execute') + ->once() + ->andThrow(new RuntimeException('connection refused')); + + $this->artisan('fedi-discover:poll'); + + $this->assertSame(1, $instance->fresh()->consecutive_poll_failures); + } +} From f80132dfae20192ce87300e04a0155a4545278e1 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Wed, 29 Apr 
2026 23:25:36 +0200 Subject: [PATCH 65/65] 6 - Fix ntfy threshold config typing and layout title default --- app/Services/PollAlertService.php | 4 ++-- config/services.php | 2 +- .../src/Console/Commands/PollInstancesCommand.php | 2 +- resources/views/layouts/app.blade.php | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/app/Services/PollAlertService.php b/app/Services/PollAlertService.php index 78ef8bc..44185d1 100644 --- a/app/Services/PollAlertService.php +++ b/app/Services/PollAlertService.php @@ -16,10 +16,10 @@ public function recordFailure(Instance $instance, string $message): void $instance->refresh(); $ntfyUrl = config('services.ntfy.url'); - $ntfyThreshold = (int) config('services.ntfy.threshold'); + $ntfyThreshold = config('services.ntfy.threshold'); $ntfyTopic = config('services.ntfy.topic'); - if ($ntfyUrl === null || $ntfyThreshold === 0 || $ntfyTopic === null) { + if ($ntfyUrl === null || $ntfyThreshold === null || $ntfyThreshold === 0 || $ntfyTopic === null) { return; } diff --git a/config/services.php b/config/services.php index 93fd034..43eb064 100644 --- a/config/services.php +++ b/config/services.php @@ -17,7 +17,7 @@ 'ntfy' => [ 'url' => env('NTFY_URL') ?: null, 'topic' => env('NTFY_TOPIC') ?: null, - 'threshold' => env('NTFY_THRESHOLD'), + 'threshold' => env('NTFY_THRESHOLD') !== null ? (int) env('NTFY_THRESHOLD') : null, ], 'postmark' => [ diff --git a/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php b/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php index e92c496..031049e 100644 --- a/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php +++ b/packages/Lvl0/FediDiscover/src/Console/Commands/PollInstancesCommand.php @@ -52,7 +52,7 @@ public function handle(): int $errors->each(fn (array $errorArr) => PollFailed::dispatch( $errorArr['instance'], - $errorArr['error'] ?? 
'', + $errorArr['error'], now()->toImmutable(), )); diff --git a/resources/views/layouts/app.blade.php b/resources/views/layouts/app.blade.php index 8e6f01a..97a6501 100644 --- a/resources/views/layouts/app.blade.php +++ b/resources/views/layouts/app.blade.php @@ -4,7 +4,7 @@ - Trove @yield('title') + Trove @yield('title', config('app.name')) @vite(['resources/css/app.css', 'resources/js/app.js'])