Compare commits

..

No commits in common. "264180cd369d12a56792168266de81787d224f92" and "6b610b699eee191b0e3bfe529a1c51db8fde6970" have entirely different histories.

45 changed files with 56 additions and 3010 deletions

View file

@ -61,5 +61,3 @@ AWS_BUCKET=
AWS_USE_PATH_STYLE_ENDPOINT=false
VITE_APP_NAME="${APP_NAME}"
CRAWLER_MIN_DOMAIN_DELAY_SECONDS=10

View file

@ -36,7 +36,6 @@ ### Required environment
### Services you need to provide
- **App**: pull `forge.lvl0.xyz/lvl0/trove:latest` (or a pinned `v*` tag). Exposes port `8000` inside the container. The image runs migrations and warms caches on boot.
- **Worker**: same image as `app`, with `command: php artisan queue:work --tries=3 --max-time=3600`. Processes the crawler queue (URL fetching, content extraction, retries). Crawls won't actually run without this — `app` only enqueues work. **Required for the crawler to function.**
- **PostgreSQL 17**. Hostname must be reachable as `db` (default) or set `DB_HOST`. Persist `/var/lib/postgresql/data`.
- **Redis 7** with `--appendonly yes` (queue jobs persist across restarts). Hostname `redis` or set `REDIS_HOST`.
@ -72,22 +71,6 @@ ### Example compose stack
db: { condition: service_healthy }
redis: { condition: service_healthy }
worker:
image: forge.lvl0.xyz/lvl0/trove:latest
restart: always
command: php artisan queue:work --tries=3 --max-time=3600
environment:
APP_KEY: "${APP_KEY}"
APP_URL: "${APP_URL}"
DB_DATABASE: "${DB_DATABASE}"
DB_USERNAME: "${DB_USERNAME}"
DB_PASSWORD: "${DB_PASSWORD}"
volumes:
- app_storage:/app/storage
depends_on:
db: { condition: service_healthy }
redis: { condition: service_healthy }
db:
image: postgres:17-alpine
restart: always

View file

@ -1,162 +0,0 @@
<?php
declare(strict_types=1);
namespace App\Actions;
use App\Enums\CrawlOutcomeEnum;
use App\Services\UrlService;
use App\ValueObjects\FetchResult;
use fivefilters\Readability\Configuration;
use fivefilters\Readability\Readability;
use GuzzleHttp\Exception\ConnectException;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\Factory;
use Illuminate\Http\Client\Response;
use InvalidArgumentException;
use League\Uri\BaseUri;
use Symfony\Component\DomCrawler\Crawler;
use Throwable;
/**
 * Fetches a single URL over HTTP and converts the response into a FetchResult.
 *
 * On a successful HTML response the page title, the Readability-extracted main
 * text and the outbound links found inside that main content are returned.
 * Connection-level failures and non-indexable responses are reported through
 * the result's outcome rather than by throwing.
 */
class FetchPageAction
{
    /** Fallback used when `crawler.timeout` is missing, non-numeric or not positive. */
    private const DEFAULT_TIMEOUT_SECONDS = 10;

    /** Fallback used when `crawler.max_redirects` is missing or non-numeric. */
    private const DEFAULT_MAX_REDIRECTS = 5;

    public function __construct(
        private Factory $http,
        private UrlService $urlService,
    ) {}

    /**
     * Fetch $url and classify the response.
     *
     * @param string $url Absolute URL to request.
     * @return FetchResult Outcome plus extracted data; extraction fields stay null/empty unless the outcome is Success.
     */
    public function __invoke(string $url): FetchResult
    {
        try {
            $response = $this->http
                ->timeout($this->timeoutSeconds())
                ->withHeaders([
                    'User-Agent' => config('crawler.user_agent'),
                    'Accept' => 'text/html',
                ])
                ->withOptions([
                    'allow_redirects' => ['max' => $this->maxRedirects()],
                ])
                ->get($url);
        } catch (ConnectionException|ConnectException $e) {
            return $this->failureResult($e);
        }

        [$outcome, $error] = $this->validateResponse($response);

        // Explicit defaults for every non-Success outcome.
        $title = null;
        $extractedText = null;
        $links = collect();
        $wordCount = null;

        if ($outcome === CrawlOutcomeEnum::Success) {
            [$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url);
            $wordCount = $this->countWords($extractedText);
        }

        return new FetchResult(
            outcome: $outcome,
            statusCode: $response->status(),
            finalUrl: $url,
            title: $title,
            extractedText: $extractedText,
            outboundLinks: $links,
            wordCount: $wordCount,
            errorMessage: $error,
        );
    }

    /**
     * Request timeout in seconds.
     *
     * Values sourced from the environment arrive as strings, so they are
     * normalised to a number here. Missing, non-numeric or non-positive values
     * fall back to the default so a misconfiguration cannot disable the timeout.
     */
    private function timeoutSeconds(): int|float
    {
        $configured = config('crawler.timeout');

        if (is_numeric($configured) && $configured > 0) {
            return $configured + 0;
        }

        return self::DEFAULT_TIMEOUT_SECONDS;
    }

    /**
     * Maximum number of redirects to follow, normalised to a non-negative int.
     */
    private function maxRedirects(): int
    {
        $configured = config('crawler.max_redirects');

        return is_numeric($configured)
            ? max(0, (int) $configured)
            : self::DEFAULT_MAX_REDIRECTS;
    }

    /**
     * Count whitespace-separated words in already-trimmed text.
     *
     * preg_split() returns false when the subject is not valid UTF-8 and the
     * `u` modifier is used; in that case fall back to a byte-wise split instead
     * of passing false to count().
     */
    private function countWords(string $text): int
    {
        $trimmed = trim($text);
        if ($trimmed === '') {
            return 0;
        }

        $words = preg_split('/\s+/u', $trimmed, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            $words = preg_split('/\s+/', $trimmed, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        }

        return count($words);
    }

    /**
     * Classify a received response by status code and Content-Type.
     *
     * @return array{0: CrawlOutcomeEnum, 1: ?string} Outcome and an optional error message.
     */
    private function validateResponse(Response $response): array
    {
        $status = $response->status();

        if ($status >= 400 && $status < 500) {
            return [CrawlOutcomeEnum::Blocked4xx, "HTTP {$status}"];
        }

        if ($status >= 500) {
            return [CrawlOutcomeEnum::Blocked5xx, "HTTP {$status}"];
        }

        $contentType = $response->header('Content-Type');

        if (! str_starts_with(mb_strtolower($contentType), 'text/html')) {
            return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"];
        }

        return [CrawlOutcomeEnum::Success, null];
    }

    /**
     * Build the result for a request that never produced a response.
     *
     * The outcome is Timeout when the underlying Guzzle handler context reports
     * CURLE_OPERATION_TIMEDOUT, otherwise Failed.
     */
    private function failureResult(ConnectionException|ConnectException $e): FetchResult
    {
        // The Guzzle exception is either $e itself or its previous exception.
        $guzzleException = $e instanceof ConnectException
            ? $e
            : ($e->getPrevious() instanceof ConnectException
                ? $e->getPrevious()
                : null);

        $errno = $guzzleException?->getHandlerContext()['errno'] ?? null;

        $outcome = $errno === CURLE_OPERATION_TIMEDOUT
            ? CrawlOutcomeEnum::Timeout
            : CrawlOutcomeEnum::Failed;

        return new FetchResult(
            outcome: $outcome,
            statusCode: null,
            finalUrl: null,
            title: null,
            extractedText: null,
            outboundLinks: collect(),
            wordCount: null,
            errorMessage: $e->getMessage(),
        );
    }

    /**
     * Extract the <title>, the main readable text and the links inside it.
     *
     * @return array{0: ?string, 1: string, 2: \Illuminate\Support\Collection<int, string>}
     */
    private function extractTitleTextAndLinks(string $body, string $url): array
    {
        $titleNodes = (new Crawler($body))->filter('title');
        $title = $titleNodes->count() > 0
            ? trim($titleNodes->text())
            : null;

        try {
            $readability = new Readability(new Configuration);
            $readability->parse($body);
            $mainContent = $readability->getContent() ?? '';
        } catch (\fivefilters\Readability\ParseException) {
            // Readability throws when it cannot find an article body. The HTTP
            // fetch itself succeeded, so report empty content instead of
            // letting the exception escape and fail the caller.
            $mainContent = '';
        }

        $extractedText = trim(strip_tags($mainContent));

        $links = collect();
        if ($mainContent !== '') {
            $anchors = (new Crawler($mainContent))->filter('a[href]');
            if ($anchors->count() > 0) {
                $links = collect($anchors->extract(['href']));
            }
        }

        $linksResolved = $links
            ->map(fn (string $href) => $this->resolveAndValidateLink($href, $url))
            ->filter()
            ->unique()
            ->values();

        return [$title, $extractedText, $linksResolved];
    }

    /**
     * Resolve $href against $finalUrl, strip any fragment and validate the host.
     *
     * @return ?string The absolute URL, or null when it cannot be resolved, points
     *                 back at $finalUrl, or is rejected by UrlService::host().
     */
    private function resolveAndValidateLink(string $href, string $finalUrl): ?string
    {
        try {
            $resolved = (string) BaseUri::from($finalUrl)->resolve($href);
            $resolved = strstr($resolved, '#', true) ?: $resolved;
        } catch (Throwable) {
            return null;
        }

        if ($resolved === $finalUrl) {
            return null;
        }

        try {
            // Called only for its validation side effect; the host itself is unused.
            $this->urlService->host($resolved);
        } catch (InvalidArgumentException) {
            return null;
        }

        return $resolved;
    }
}

View file

@ -1,22 +0,0 @@
<?php
declare(strict_types=1);
namespace App\Actions;
use App\Enums\PageStatusEnum;
use App\Models\Page;
/**
 * Ensures a Page row exists for a URL, creating it in the Discovered state
 * when it is not already present.
 */
class RegisterDiscoveredPageAction
{
    /**
     * @param string $url        URL used as the lookup key.
     * @param ?int   $instanceId Stored on the row only when a new Page is created.
     * @return Page The existing or newly created page.
     */
    public function __invoke(string $url, ?int $instanceId = null): Page
    {
        $lookup = ['url' => $url];

        $attributesForNewPage = [
            'status' => PageStatusEnum::Discovered,
            'instance_id' => $instanceId,
        ];

        return Page::firstOrCreate($lookup, $attributesForNewPage);
    }
}

View file

@ -1,60 +0,0 @@
<?php
declare(strict_types=1);
namespace App\Enums;
/**
 * Result classification for a single crawl attempt.
 */
enum CrawlOutcomeEnum: string
{
    case Success = 'success';
    case Failed = 'failed';
    case Timeout = 'timeout';
    case BlockedRobots = 'blocked_robots';
    case Blocked4xx = 'blocked_4xx';
    case Blocked5xx = 'blocked_5xx';

    /**
     * The HTTP fetch succeeded (2xx) but the response is unindexable in v0.1
     * (non-HTML Content-Type). Worker MUST also write `pages.status = Rejected`
     * on this outcome — do NOT treat as Failed. Page row STAYS in the DB to
     * prevent re-discovery loops as fediverse re-shares the URL.
     */
    case Rejected = 'rejected';

    /**
     * Maps this outcome to the status the owning `pages` row should be given.
     */
    public function toPageStatus(): PageStatusEnum
    {
        return match ($this) {
            self::Failed,
            self::Timeout,
            self::BlockedRobots,
            self::Blocked4xx,
            self::Blocked5xx => PageStatusEnum::Failed,
            self::Rejected => PageStatusEnum::Rejected,
            self::Success => PageStatusEnum::Fetched,
        };
    }

    /**
     * Whether the worker should try this page again.
     *
     * Only transient failures are retryable; successes and permanent failures
     * (4xx, robots block, rejected content type) are not.
     */
    public function isRetryable(): bool
    {
        return match ($this) {
            self::Success,
            self::Rejected,
            self::BlockedRobots,
            self::Blocked4xx => false,
            self::Failed,
            self::Timeout,
            self::Blocked5xx => true,
        };
    }

    /**
     * Whether outbound links discovered during the fetch should be registered.
     * Only a Success outcome carries usable links.
     */
    public function shouldRegisterOutboundLinks(): bool
    {
        return match ($this) {
            self::Success => true,
            default => false,
        };
    }
}

View file

@ -9,12 +9,4 @@ enum PageStatusEnum: string
case Discovered = 'discovered';
case Fetched = 'fetched';
case Failed = 'failed';
/**
* The crawler fetched the page but rejected it as unindexable in v0.1
* (non-HTML Content-Type). Page row stays as a sentinel preventing
* re-discovery loops; future re-crawl could flip status back to
* Discovered → Fetched if the URL starts serving HTML.
*/
case Rejected = 'rejected';
}

View file

@ -1,105 +0,0 @@
<?php
declare(strict_types=1);
namespace App\Jobs;
use App\Actions\FetchPageAction;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\PageStatusEnum;
use App\Models\PageCrawl;
use App\Services\PolitenessService;
use App\ValueObjects\FetchResult;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Queue\Queueable;
use Illuminate\Support\Facades\Cache;
/**
 * Queue job that performs one crawl attempt for a PageCrawl row.
 *
 * It takes a per-domain cache lock for politeness, fetches the page, records
 * the outcome on the PageCrawl and Page rows, registers outbound links on
 * success and schedules a delayed retry for retryable outcomes.
 */
class ProcessCrawlJob implements ShouldQueue
{
    use Queueable;

    /** Retries stop once a page has this many PageCrawl rows. */
    private const MAX_CRAWLS_PER_PAGE = 3;

    public function __construct(
        public PageCrawl $pageCrawl,
    ) {}

    public function handle(): void
    {
        $fetcher = resolve(FetchPageAction::class);
        $register = resolve(RegisterDiscoveredPageAction::class);
        $politenessService = resolve(PolitenessService::class);

        $delay = $politenessService->minDelayFor($this->pageCrawl->domain);

        // The lock is never released explicitly: it expires after $delay
        // seconds, which is what spaces out requests to the same domain.
        $lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay);

        if (! $lock->get()) {
            // Another crawl for this domain ran within the delay window; try again later.
            // NOTE(review): each release presumably counts as an attempt against the
            // worker's --tries limit — confirm this cannot exhaust attempts on busy domains.
            $this->release($delay);

            return;
        }

        $result = $fetcher($this->pageCrawl->page->url);

        $this->writeOutcome($result);
        $this->updatePageStatus($result);

        if ($result->outcome->shouldRegisterOutboundLinks()) {
            $result->outboundLinks->each(fn (string $url) => $register($url));
        }

        if ($result->outcome->isRetryable()) {
            $this->scheduleRetryIfNeeded();
        }
    }

    /**
     * Persist the fetch outcome on this job's PageCrawl row.
     */
    private function writeOutcome(FetchResult $result): void
    {
        $this->pageCrawl->update([
            'outcome' => $result->outcome,
            'completed_at' => now(),
            'status_code' => $result->statusCode,
            'error_message' => $result->errorMessage,
        ]);
    }

    /**
     * Move the parent Page to the status implied by the outcome.
     */
    private function updatePageStatus(FetchResult $result): void
    {
        $status = $result->outcome->toPageStatus();

        $update = match ($status) {
            PageStatusEnum::Fetched => [
                'status' => $status,
                'fetched_at' => now(),
                'title' => $result->title,
            ],
            PageStatusEnum::Failed => [
                'status' => $status,
                'failed_at' => now(),
            ],
            PageStatusEnum::Rejected => [
                'status' => $status,
            ],
            PageStatusEnum::Discovered => [
                'status' => $status,
            ],
        };

        $this->pageCrawl->page->update($update);
    }

    /**
     * Create a fresh PageCrawl row for the same page and dispatch it in one hour,
     * unless the page has already reached the attempt limit.
     */
    private function scheduleRetryIfNeeded(): void
    {
        if (PageCrawl::where('page_id', $this->pageCrawl->page_id)->count() >= self::MAX_CRAWLS_PER_PAGE) {
            return;
        }

        // Copy only the identifying fields. Cloning toArray() would also carry
        // completed_at, status_code and error_message from the finished attempt,
        // so the retry row would look completed before it ever ran.
        // Model events are suppressed so the new row is not dispatched twice.
        $newRow = PageCrawl::withoutEvents(
            fn () => PageCrawl::create([
                'page_id' => $this->pageCrawl->page_id,
                'domain' => $this->pageCrawl->domain,
                'priority' => $this->pageCrawl->priority,
                'outcome' => null,
            ])
        );

        ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
    }
}

View file

@ -4,7 +4,8 @@
namespace App\Listeners;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\PageStatusEnum;
use App\Models\Page;
use App\Models\PageLink;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Support\Facades\DB;
@ -12,20 +13,22 @@
class UrlDiscoveredListener implements ShouldQueue
{
public function __construct(
private RegisterDiscoveredPageAction $registerPage,
) {}
public function handle(UrlDiscovered $event): void
{
DB::transaction(function () use ($event) {
$targetPage = ($this->registerPage)($event->url, $event->instanceId);
$targetPage = Page::firstOrCreate(
['url' => $event->url],
['status' => PageStatusEnum::Discovered, 'instance_id' => $event->instanceId],
);
if ($event->postUrl === null || $event->postUrl === $event->url) {
return;
}
$sourcePage = ($this->registerPage)($event->postUrl, $event->instanceId);
$sourcePage = Page::firstOrCreate(
['url' => $event->postUrl],
['status' => PageStatusEnum::Discovered, 'instance_id' => $event->instanceId],
);
PageLink::firstOrCreate([
'source_page_id' => $sourcePage->id,

View file

@ -1,44 +0,0 @@
<?php
declare(strict_types=1);
namespace App\Livewire;
use App\Actions\RegisterDiscoveredPageAction;
use Illuminate\Contracts\View\View;
use Illuminate\Support\Facades\RateLimiter;
use Livewire\Component;
/**
 * Livewire form that lets a visitor submit a URL for registration.
 */
class UrlSubmissionForm extends Component
{
    /** Current value of the URL input. */
    public string $url = '';

    /** The most recently accepted URL, or null before any successful submission. */
    public ?string $confirmedUrl = null;

    /**
     * Rate-limit by client IP (10 attempts, 60-second decay), validate the URL
     * and register it. The attempt is counted before validation runs.
     */
    public function submit(RegisterDiscoveredPageAction $registerPage): void
    {
        $throttleKey = 'submit-url:' . request()->ip();
        $isThrottled = RateLimiter::tooManyAttempts($throttleKey, 10);

        if ($isThrottled) {
            $this->addError('rate_limit', 'Too many submissions, try again shortly.');

            return;
        }

        RateLimiter::hit($throttleKey, 60);

        ['url' => $submittedUrl] = $this->validate([
            'url' => ['required', 'url:http,https'],
        ]);

        $registerPage($submittedUrl);

        $this->confirmedUrl = $submittedUrl;
        $this->reset('url');
    }

    public function render(): View
    {
        return view('livewire.url-submission-form');
    }
}

View file

@ -5,17 +5,13 @@
namespace App\Models;
use App\Enums\PageStatusEnum;
use App\Observers\PageObserver;
use Database\Factories\PageFactory;
use Illuminate\Database\Eloquent\Attributes\ObservedBy;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
use Illuminate\Database\Eloquent\Relations\HasMany;
use Illuminate\Database\Eloquent\Relations\HasOne;
use Lvl0\FediDiscover\Models\Instance;
#[ObservedBy([PageObserver::class])]
class Page extends Model
{
/** @use HasFactory<PageFactory> */
@ -24,7 +20,6 @@ class Page extends Model
protected $fillable = [
'url',
'status',
'language',
'title',
'instance_id',
'posted_at',
@ -53,14 +48,4 @@ public function incomingLinks(): HasMany
{
return $this->hasMany(PageLink::class, 'target_page_id');
}
public function crawls(): HasMany
{
return $this->hasMany(PageCrawl::class);
}
public function latestCrawl(): HasOne
{
return $this->hasOne(PageCrawl::class)->latestOfMany('created_at');
}
}

View file

@ -1,45 +0,0 @@
<?php
declare(strict_types=1);
namespace App\Models;
use App\Enums\CrawlOutcomeEnum;
use App\Observers\PageCrawlObserver;
use Database\Factories\PageCrawlFactory;
use Illuminate\Database\Eloquent\Attributes\ObservedBy;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
/**
 * Eloquent model for one crawl attempt of a Page.
 *
 * Model events are observed by PageCrawlObserver (attached via the
 * ObservedBy attribute below).
 */
#[ObservedBy(PageCrawlObserver::class)]
class PageCrawl extends Model
{
/** @use HasFactory<PageCrawlFactory> */
use HasFactory;
// Attributes that may be mass-assigned through create()/update().
protected $fillable = [
'page_id',
'domain',
'priority',
'completed_at',
'outcome',
'status_code',
'error_message',
];
// Attribute casts: `outcome` is hydrated as a CrawlOutcomeEnum case and
// `completed_at` as a datetime instance.
protected $casts = [
'priority' => 'integer',
'completed_at' => 'datetime',
'outcome' => CrawlOutcomeEnum::class,
'status_code' => 'integer',
];
/**
 * The page this crawl attempt belongs to.
 *
* @return BelongsTo<Page, $this>
*/
public function page(): BelongsTo
{
return $this->belongsTo(Page::class);
}
}

View file

@ -1,14 +0,0 @@
<?php
namespace App\Observers;
use App\Jobs\ProcessCrawlJob;
use App\Models\PageCrawl;
/**
 * Queues a crawl job whenever a PageCrawl row is created.
 */
class PageCrawlObserver
{
    /**
     * Dispatch the crawl job for the newly created row.
     *
     * The dispatch is deferred until any open database transaction commits.
     * Otherwise a worker could pick the job up before the row is visible and
     * fail to load the model. Outside a transaction the job is dispatched
     * immediately, as before.
     */
    public function created(PageCrawl $pageCrawl): void
    {
        ProcessCrawlJob::dispatch($pageCrawl)->afterCommit();
    }
}

View file

@ -1,25 +0,0 @@
<?php
declare(strict_types=1);
namespace App\Observers;
use App\Models\Page;
use App\Models\PageCrawl;
use App\Services\UrlService;
/**
 * Creates the initial PageCrawl row when a Page is created.
 */
class PageObserver
{
    public function __construct(private UrlService $urlService) {}

    /**
     * Ensure a PageCrawl exists for the new page, using the host derived from
     * the page URL as its domain and priority 0.
     */
    public function created(Page $page): void
    {
        $lookup = ['page_id' => $page->id];

        $attributesForNewCrawl = [
            'domain' => $this->urlService->host($page->url),
            'priority' => 0,
        ];

        PageCrawl::firstOrCreate($lookup, $attributesForNewCrawl);
    }
}

View file

@ -1,19 +0,0 @@
<?php
declare(strict_types=1);
namespace App\Services;
/**
 * Supplies the minimum delay to keep between requests to the same domain.
 */
class PolitenessService
{
    /** Delay used when `crawler.min_domain_delay_seconds` is unset or not numeric. */
    private const DEFAULT_MIN_DELAY_SECONDS = 10;

    /**
     * Minimum number of seconds between two requests to $domain.
     *
     * The configured value usually comes from the environment and is therefore
     * a string. Returning it unchanged from an int-typed method under
     * strict_types raises a TypeError, so it is normalised to int here.
     *
     * @param string $domain Currently unused; the same delay applies to every domain.
     */
    public function minDelayFor(string $domain): int
    {
        $configValue = config('crawler.min_domain_delay_seconds');

        if (is_numeric($configValue)) {
            return (int) $configValue;
        }

        return self::DEFAULT_MIN_DELAY_SECONDS;
    }
}

View file

@ -1,40 +0,0 @@
<?php
declare(strict_types=1);
namespace App\Services;
use Illuminate\Support\Uri;
use InvalidArgumentException;
/**
 * URL validation helpers used by the crawler.
 */
class UrlService
{
    /**
     * Validate $url and return its lower-cased host.
     *
     * @throws InvalidArgumentException When the URL has no scheme, a scheme other
     *         than http/https, embedded credentials, no host, or an IP-literal host.
     */
    public function host(string $url): string
    {
        $parsed = Uri::of($url);

        $scheme = $parsed->scheme() ?? '';
        if ($scheme === '') {
            throw new InvalidArgumentException("URL has no scheme: {$url}");
        }
        if ($scheme !== 'http' && $scheme !== 'https') {
            throw new InvalidArgumentException("Invalid URL scheme: {$scheme}");
        }

        $hasCredentials = $parsed->user() !== null;
        if ($hasCredentials) {
            throw new InvalidArgumentException("URLs with embedded credentials not allowed: {$url}");
        }

        $host = $parsed->host() ?? '';
        if ($host === '') {
            throw new InvalidArgumentException("URL has no host: {$url}");
        }

        // Drop IPv6 brackets and anything from the first '%' onwards before the IP test.
        $withoutBrackets = trim($host, '[]');
        $candidateIp = preg_replace('/%.*$/', '', $withoutBrackets);
        $isIpLiteral = filter_var($candidateIp, FILTER_VALIDATE_IP) !== false;
        if ($isIpLiteral) {
            throw new InvalidArgumentException("IP literal hosts not allowed: {$host}");
        }

        return mb_strtolower($host);
    }
}

View file

@ -1,26 +0,0 @@
<?php
declare(strict_types=1);
namespace App\ValueObjects;
use App\Enums\CrawlOutcomeEnum;
use Illuminate\Support\Collection;
/**
 * Immutable value object describing the result of one page fetch.
 */
final readonly class FetchResult
{
/**
 * @param CrawlOutcomeEnum $outcome Classification of the fetch attempt.
 * @param ?int $statusCode HTTP status of the response; FetchPageAction passes null when the connection failed before any response.
* @param ?string $finalUrl Set to the request URL in v0.1; true post-redirect URL tracking is deferred (see ticket #12 spec). Downstream consumers MUST NOT trust this field as the post-redirect location until that lands.
 * @param ?string $title Page title, when one was extracted.
 * @param ?string $extractedText Main text extracted from the page, when available.
* @param Collection<int, string> $outboundLinks
 * @param ?int $wordCount Word count of the extracted text, when available.
 * @param ?string $errorMessage Description of the failure for non-success outcomes.
*/
public function __construct(
public CrawlOutcomeEnum $outcome,
public ?int $statusCode,
public ?string $finalUrl,
public ?string $title,
public ?string $extractedText,
public Collection $outboundLinks,
public ?int $wordCount,
public ?string $errorMessage,
) {}
}

View file

@ -3,7 +3,6 @@
use Illuminate\Foundation\Application;
use Illuminate\Foundation\Configuration\Exceptions;
use Illuminate\Foundation\Configuration\Middleware;
use Illuminate\Http\Request;
return Application::configure(basePath: dirname(__DIR__))
->withRouting(
@ -12,11 +11,7 @@
health: '/up',
)
->withMiddleware(function (Middleware $middleware): void {
$middleware->trustProxies(
at: '*',
headers: Request::HEADER_X_FORWARDED_FOR
| Request::HEADER_X_FORWARDED_PROTO,
);
//
})
->withExceptions(function (Exceptions $exceptions): void {
//

View file

@ -16,12 +16,10 @@
],
"require": {
"php": "^8.3",
"fivefilters/readability.php": "^3.3",
"laravel/framework": "^13.0",
"laravel/tinker": "^3.0",
"livewire/livewire": "^4.2",
"lvl0/fedi-discover": "@dev",
"symfony/dom-crawler": "^7.4"
"lvl0/fedi-discover": "@dev"
},
"require-dev": {
"fakerphp/faker": "^1.23",

276
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "2c63ed546b17b144997244f805e8a94a",
"content-hash": "e46e58784ec34415557c78db6bb6c97e",
"packages": [
{
"name": "brick/math",
@ -508,71 +508,6 @@
],
"time": "2025-03-06T22:45:56+00:00"
},
{
"name": "fivefilters/readability.php",
"version": "v3.3.3",
"source": {
"type": "git",
"url": "https://github.com/fivefilters/readability.php.git",
"reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/fivefilters/readability.php/zipball/e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8",
"reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-mbstring": "*",
"ext-xml": "*",
"league/uri": "^7.0",
"masterminds/html5": "^2.0",
"php": ">=8.1",
"psr/log": "^1.0 || ^2.0 || ^3.0"
},
"require-dev": {
"monolog/monolog": "^3.0",
"phpunit/phpunit": "^10.0 || ^11.0"
},
"suggest": {
"monolog/monolog": "Allow logging debug information"
},
"type": "library",
"autoload": {
"psr-4": {
"fivefilters\\Readability\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"Apache-2.0"
],
"authors": [
{
"name": "Andres Rey",
"email": "andreskrey@gmail.com",
"role": "Original Developer"
},
{
"name": "Keyvan Minoukadeh",
"email": "keyvan@fivefilters.org",
"homepage": "https://www.fivefilters.org",
"role": "Developer/Maintainer"
}
],
"description": "A PHP port of Readability.js",
"homepage": "https://github.com/fivefilters/readability.php",
"keywords": [
"html",
"readability"
],
"support": {
"issues": "https://github.com/fivefilters/readability.php/issues",
"source": "https://github.com/fivefilters/readability.php/tree/v3.3.3"
},
"time": "2025-04-26T23:45:37+00:00"
},
{
"name": "fruitcake/php-cors",
"version": "v1.4.0",
@ -2167,7 +2102,7 @@
},
{
"name": "lvl0/fedi-discover",
"version": "dev-release/0.1.0",
"version": "dev-main",
"dist": {
"type": "path",
"url": "packages/Lvl0/FediDiscover",
@ -2207,73 +2142,6 @@
"relative": true
}
},
{
"name": "masterminds/html5",
"version": "2.10.0",
"source": {
"type": "git",
"url": "https://github.com/Masterminds/html5-php.git",
"reference": "fcf91eb64359852f00d921887b219479b4f21251"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/Masterminds/html5-php/zipball/fcf91eb64359852f00d921887b219479b4f21251",
"reference": "fcf91eb64359852f00d921887b219479b4f21251",
"shasum": ""
},
"require": {
"ext-dom": "*",
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7 || ^8 || ^9"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "2.7-dev"
}
},
"autoload": {
"psr-4": {
"Masterminds\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Matt Butcher",
"email": "technosophos@gmail.com"
},
{
"name": "Matt Farina",
"email": "matt@mattfarina.com"
},
{
"name": "Asmir Mustafic",
"email": "goetas@gmail.com"
}
],
"description": "An HTML5 parser and serializer.",
"homepage": "http://masterminds.github.io/html5-php",
"keywords": [
"HTML5",
"dom",
"html",
"parser",
"querypath",
"serializer",
"xml"
],
"support": {
"issues": "https://github.com/Masterminds/html5-php/issues",
"source": "https://github.com/Masterminds/html5-php/tree/2.10.0"
},
"time": "2025-07-25T09:04:22+00:00"
},
{
"name": "monolog/monolog",
"version": "3.10.0",
@ -3861,78 +3729,6 @@
],
"time": "2024-09-25T14:21:43+00:00"
},
{
"name": "symfony/dom-crawler",
"version": "v7.4.8",
"source": {
"type": "git",
"url": "https://github.com/symfony/dom-crawler.git",
"reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/dom-crawler/zipball/2918e7c2ba964defca1f5b69c6f74886529e2dc8",
"reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8",
"shasum": ""
},
"require": {
"masterminds/html5": "^2.6",
"php": ">=8.2",
"symfony/deprecation-contracts": "^2.5|^3",
"symfony/polyfill-ctype": "~1.8",
"symfony/polyfill-mbstring": "~1.0"
},
"require-dev": {
"symfony/css-selector": "^6.4|^7.0|^8.0"
},
"type": "library",
"autoload": {
"psr-4": {
"Symfony\\Component\\DomCrawler\\": ""
},
"exclude-from-classmap": [
"/Tests/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Fabien Potencier",
"email": "fabien@symfony.com"
},
{
"name": "Symfony Community",
"homepage": "https://symfony.com/contributors"
}
],
"description": "Eases DOM navigation for HTML and XML documents",
"homepage": "https://symfony.com",
"support": {
"source": "https://github.com/symfony/dom-crawler/tree/v7.4.8"
},
"funding": [
{
"url": "https://symfony.com/sponsor",
"type": "custom"
},
{
"url": "https://github.com/fabpot",
"type": "github"
},
{
"url": "https://github.com/nicolas-grekas",
"type": "github"
},
{
"url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
"type": "tidelift"
}
],
"time": "2026-03-24T13:12:05+00:00"
},
{
"name": "symfony/error-handler",
"version": "v7.4.8",
@ -4620,7 +4416,7 @@
},
{
"name": "symfony/polyfill-ctype",
"version": "v1.37.0",
"version": "v1.36.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-ctype.git",
@ -4679,7 +4475,7 @@
"portable"
],
"support": {
"source": "https://github.com/symfony/polyfill-ctype/tree/v1.37.0"
"source": "https://github.com/symfony/polyfill-ctype/tree/v1.36.0"
},
"funding": [
{
@ -4703,16 +4499,16 @@
},
{
"name": "symfony/polyfill-intl-grapheme",
"version": "v1.37.0",
"version": "v1.36.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-intl-grapheme.git",
"reference": "4864388bfbd3001ce88e234fab652acd91fdc57e"
"reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/4864388bfbd3001ce88e234fab652acd91fdc57e",
"reference": "4864388bfbd3001ce88e234fab652acd91fdc57e",
"url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/ad1b7b9092976d6c948b8a187cec9faaea9ec1df",
"reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df",
"shasum": ""
},
"require": {
@ -4761,7 +4557,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.37.0"
"source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.36.0"
},
"funding": [
{
@ -4781,11 +4577,11 @@
"type": "tidelift"
}
],
"time": "2026-04-26T13:13:48+00:00"
"time": "2026-04-10T16:19:22+00:00"
},
{
"name": "symfony/polyfill-intl-idn",
"version": "v1.37.0",
"version": "v1.36.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-intl-idn.git",
@ -4848,7 +4644,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.37.0"
"source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.36.0"
},
"funding": [
{
@ -4872,7 +4668,7 @@
},
{
"name": "symfony/polyfill-intl-normalizer",
"version": "v1.37.0",
"version": "v1.36.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-intl-normalizer.git",
@ -4933,7 +4729,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.37.0"
"source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.36.0"
},
"funding": [
{
@ -4957,7 +4753,7 @@
},
{
"name": "symfony/polyfill-mbstring",
"version": "v1.37.0",
"version": "v1.36.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-mbstring.git",
@ -5018,7 +4814,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-mbstring/tree/v1.37.0"
"source": "https://github.com/symfony/polyfill-mbstring/tree/v1.36.0"
},
"funding": [
{
@ -5042,7 +4838,7 @@
},
{
"name": "symfony/polyfill-php80",
"version": "v1.37.0",
"version": "v1.36.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-php80.git",
@ -5102,7 +4898,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-php80/tree/v1.37.0"
"source": "https://github.com/symfony/polyfill-php80/tree/v1.36.0"
},
"funding": [
{
@ -5126,7 +4922,7 @@
},
{
"name": "symfony/polyfill-php83",
"version": "v1.37.0",
"version": "v1.36.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-php83.git",
@ -5182,7 +4978,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-php83/tree/v1.37.0"
"source": "https://github.com/symfony/polyfill-php83/tree/v1.36.0"
},
"funding": [
{
@ -5206,7 +5002,7 @@
},
{
"name": "symfony/polyfill-php84",
"version": "v1.37.0",
"version": "v1.36.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-php84.git",
@ -5262,7 +5058,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-php84/tree/v1.37.0"
"source": "https://github.com/symfony/polyfill-php84/tree/v1.36.0"
},
"funding": [
{
@ -5286,16 +5082,16 @@
},
{
"name": "symfony/polyfill-php85",
"version": "v1.37.0",
"version": "v1.36.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-php85.git",
"reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee"
"reference": "2c408a6bb0313e6001a83628dc5506100474254e"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/fcfa4973a9917cef23f2e38774da74a2b7d115ee",
"reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee",
"url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/2c408a6bb0313e6001a83628dc5506100474254e",
"reference": "2c408a6bb0313e6001a83628dc5506100474254e",
"shasum": ""
},
"require": {
@ -5342,7 +5138,7 @@
"shim"
],
"support": {
"source": "https://github.com/symfony/polyfill-php85/tree/v1.37.0"
"source": "https://github.com/symfony/polyfill-php85/tree/v1.36.0"
},
"funding": [
{
@ -5362,11 +5158,11 @@
"type": "tidelift"
}
],
"time": "2026-04-26T13:10:57+00:00"
"time": "2026-04-10T16:50:15+00:00"
},
{
"name": "symfony/polyfill-uuid",
"version": "v1.37.0",
"version": "v1.36.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-uuid.git",
@ -5425,7 +5221,7 @@
"uuid"
],
"support": {
"source": "https://github.com/symfony/polyfill-uuid/tree/v1.37.0"
"source": "https://github.com/symfony/polyfill-uuid/tree/v1.36.0"
},
"funding": [
{
@ -6263,16 +6059,16 @@
},
{
"name": "voku/portable-ascii",
"version": "2.1.1",
"version": "2.1.0",
"source": {
"type": "git",
"url": "https://github.com/voku/portable-ascii.git",
"reference": "8e1051fe39379367aecf014f41744ce7539a856f"
"reference": "d870a33f0f79d2b4579740b0620200221ee44aeb"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/voku/portable-ascii/zipball/8e1051fe39379367aecf014f41744ce7539a856f",
"reference": "8e1051fe39379367aecf014f41744ce7539a856f",
"url": "https://api.github.com/repos/voku/portable-ascii/zipball/d870a33f0f79d2b4579740b0620200221ee44aeb",
"reference": "d870a33f0f79d2b4579740b0620200221ee44aeb",
"shasum": ""
},
"require": {
@ -6309,7 +6105,7 @@
],
"support": {
"issues": "https://github.com/voku/portable-ascii/issues",
"source": "https://github.com/voku/portable-ascii/tree/2.1.1"
"source": "https://github.com/voku/portable-ascii/tree/2.1.0"
},
"funding": [
{
@ -6333,7 +6129,7 @@
"type": "tidelift"
}
],
"time": "2026-04-26T05:33:54+00:00"
"time": "2026-04-16T23:10:39+00:00"
}
],
"packages-dev": [

View file

@ -1,46 +0,0 @@
<?php
declare(strict_types=1);
return [
    /*
    |---------------------------------------------------------------------------
    | HTTP timeout (seconds)
    |---------------------------------------------------------------------------
    |
    | Hard cap on a single fetch. Guzzle's default is 0 (wait forever) — never
    | acceptable for a crawler. Tune up cautiously; longer timeouts amplify the
    | impact of slow targets on overall throughput.
    |
    | Cast to int because env() yields strings, while consumers declared with
    | strict_types pass this value to integer-typed parameters.
    |
    */
    'timeout' => (int) env('CRAWLER_TIMEOUT', 10),

    /*
    |---------------------------------------------------------------------------
    | Maximum redirects to follow
    |---------------------------------------------------------------------------
    |
    | Guzzle default is 5. Cross-origin redirects are accepted in v0.1 — the
    | search engine treats the post-redirect URL as the canonical one for
    | indexing.
    |
    */
    'max_redirects' => (int) env('CRAWLER_MAX_REDIRECTS', 5),

    /*
    |---------------------------------------------------------------------------
    | User-Agent
    |---------------------------------------------------------------------------
    |
    | Identifies our crawler to target servers. The placeholder below is for
    | v0.1 development; ticket #10 replaces it with the production identity
    | and adds a `/bot` info page that the URL points at.
    |
    */
    'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'),

    /*
    |---------------------------------------------------------------------------
    | Minimum per-domain delay (seconds)
    |---------------------------------------------------------------------------
    |
    | Minimum number of seconds between two requests to the same domain.
    | Cast to int for the same reason as the timeout above.
    |
    */
    'min_domain_delay_seconds' => (int) env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10),
];

View file

@ -1,282 +0,0 @@
<?php
return [
/*
|---------------------------------------------------------------------------
| Component Locations
|---------------------------------------------------------------------------
|
| This value sets the root directories that'll be used to resolve view-based
| components like single and multi-file components. The make command will
| use the first directory in this array to add new component files to.
|
*/
'component_locations' => [
resource_path('views/components'),
resource_path('views/livewire'),
],
/*
|---------------------------------------------------------------------------
| Component Namespaces
|---------------------------------------------------------------------------
|
| This value sets default namespaces that will be used to resolve view-based
| components like single-file and multi-file components. These folders'll
| also be referenced when creating new components via the make command.
|
*/
'component_namespaces' => [
'layouts' => resource_path('views/layouts'),
'pages' => resource_path('views/pages'),
],
/*
|---------------------------------------------------------------------------
| Page Layout
|---------------------------------------------------------------------------
| The view that will be used as the layout when rendering a single component as
| an entire page via `Route::livewire('/post/create', 'pages::create-post')`.
| In this case, the content of pages::create-post will render into $slot.
|
*/
'component_layout' => 'layouts::app',
/*
|---------------------------------------------------------------------------
| Lazy Loading Placeholder
|---------------------------------------------------------------------------
| Livewire allows you to lazy load components that would otherwise slow down
| the initial page load. Every component can have a custom placeholder or
| you can define the default placeholder view for all components below.
|
*/
'component_placeholder' => null, // Example: 'placeholders::skeleton'
/*
|---------------------------------------------------------------------------
| Make Command
|---------------------------------------------------------------------------
| This value determines the default configuration for the artisan make command
| You can configure the component type (sfc, mfc, class) and whether to use
| the high-voltage (⚡) emoji as a prefix in the sfc|mfc component names.
|
*/
'make_command' => [
'type' => 'class', // Options: 'sfc', 'mfc', 'class'
'emoji' => false, // Options: true, false
'with' => [
'js' => false,
'css' => false,
'test' => false,
],
],
/*
|---------------------------------------------------------------------------
| Class Namespace
|---------------------------------------------------------------------------
|
| This value sets the root class namespace for Livewire component classes in
| your application. This value will change where component auto-discovery
| finds components. It's also referenced by the file creation commands.
|
*/
'class_namespace' => 'App\\Livewire',
/*
|---------------------------------------------------------------------------
| Class Path
|---------------------------------------------------------------------------
|
| This value is used to specify the path where Livewire component class files
| are created when running creation commands like `artisan make:livewire`.
| This path is customizable to match your projects directory structure.
|
*/
'class_path' => app_path('Livewire'),
/*
|---------------------------------------------------------------------------
| View Path
|---------------------------------------------------------------------------
|
| This value is used to specify where Livewire component Blade templates are
| stored when running file creation commands like `artisan make:livewire`.
| It is also used if you choose to omit a component's render() method.
|
*/
'view_path' => resource_path('views/livewire'),
/*
|---------------------------------------------------------------------------
| Temporary File Uploads
|---------------------------------------------------------------------------
|
| Livewire handles file uploads by storing uploads in a temporary directory
| before the file is stored permanently. All file uploads are directed to
| a global endpoint for temporary storage. You may configure this below:
|
*/
'temporary_file_upload' => [
'disk' => env('LIVEWIRE_TEMPORARY_FILE_UPLOAD_DISK'), // Example: 'local', 's3' | Default: 'default'
'rules' => null, // Example: ['file', 'mimes:png,jpg'] | Default: ['required', 'file', 'max:12288'] (12MB)
'directory' => null, // Example: 'tmp' | Default: 'livewire-tmp'
'middleware' => null, // Example: 'throttle:5,1' | Default: 'throttle:60,1'
'preview_mimes' => [ // Supported file types for temporary pre-signed file URLs...
'png', 'gif', 'bmp', 'svg', 'wav', 'mp4',
'mov', 'avi', 'wmv', 'mp3', 'm4a',
'jpg', 'jpeg', 'mpga', 'webp', 'wma',
],
'max_upload_time' => 5, // Max duration (in minutes) before an upload is invalidated...
'cleanup' => true, // Should cleanup temporary uploads older than 24 hrs...
],
/*
|---------------------------------------------------------------------------
| Render On Redirect
|---------------------------------------------------------------------------
|
| This value determines if Livewire will run a component's `render()` method
| after a redirect has been triggered using something like `redirect(...)`
| Setting this to true will render the view once more before redirecting
|
*/
'render_on_redirect' => false,
/*
|---------------------------------------------------------------------------
| Eloquent Model Binding
|---------------------------------------------------------------------------
|
| Previous versions of Livewire supported binding directly to eloquent model
| properties using wire:model by default. However, this behavior has been
| deemed too "magical" and has therefore been put under a feature flag.
|
*/
'legacy_model_binding' => false,
/*
|---------------------------------------------------------------------------
| Auto-inject Frontend Assets
|---------------------------------------------------------------------------
|
| By default, Livewire automatically injects its JavaScript and CSS into the
| <head> and <body> of pages containing Livewire components. By disabling
| this behavior, you need to use @livewireStyles and @livewireScripts.
|
*/
'inject_assets' => true,
/*
|---------------------------------------------------------------------------
| Navigate (SPA mode)
|---------------------------------------------------------------------------
|
| By adding `wire:navigate` to links in your Livewire application, Livewire
| will prevent the default link handling and instead request those pages
| via AJAX, creating an SPA-like effect. Configure this behavior here.
|
*/
'navigate' => [
'show_progress_bar' => true,
'progress_bar_color' => '#2299dd',
],
/*
|---------------------------------------------------------------------------
| HTML Morph Markers
|---------------------------------------------------------------------------
|
| Livewire intelligently "morphs" existing HTML into the newly rendered HTML
| after each update. To make this process more reliable, Livewire injects
| "markers" into the rendered Blade surrounding @if, @class & @foreach.
|
*/
'inject_morph_markers' => true,
/*
|---------------------------------------------------------------------------
| Smart Wire Keys
|---------------------------------------------------------------------------
|
| Livewire uses loops and keys used within loops to generate smart keys that
| are applied to nested components that don't have them. This makes using
| nested components more reliable by ensuring that they all have keys.
|
*/
'smart_wire_keys' => true,
/*
|---------------------------------------------------------------------------
| Pagination Theme
|---------------------------------------------------------------------------
|
| When enabling Livewire's pagination feature by using the `WithPagination`
| trait, Livewire will use Tailwind templates to render pagination views
| on the page. If you want Bootstrap CSS, you can specify: "bootstrap"
|
*/
'pagination_theme' => 'tailwind',
/*
|---------------------------------------------------------------------------
| Release Token
|---------------------------------------------------------------------------
|
| This token is stored client-side and sent along with each request to check
| a users session to see if a new release has invalidated it. If there is
| a mismatch it will throw an error and prompt for a browser refresh.
|
*/
'release_token' => 'a',
/*
|---------------------------------------------------------------------------
| CSP Safe
|---------------------------------------------------------------------------
|
| This config is used to determine if Livewire will use the CSP-safe version
| of Alpine in its bundle. This is useful for applications that are using
| strict Content Security Policy (CSP) to protect against XSS attacks.
|
*/
'csp_safe' => false,
/*
|---------------------------------------------------------------------------
| Payload Guards
|---------------------------------------------------------------------------
|
| These settings protect against malicious or oversized payloads that could
| cause denial of service. The default values should feel reasonable for
| most web applications. Each can be set to null to disable the limit.
|
*/
'payload' => [
'max_size' => 1024 * 1024, // 1MB - maximum request payload size in bytes
'max_nesting_depth' => 10, // Maximum depth of dot-notation property paths
'max_calls' => 50, // Maximum method calls per request
'max_components' => 20, // Maximum components per batch request
],
];

View file

@ -1,53 +0,0 @@
<?php
declare(strict_types=1);
namespace Database\Factories;
use App\Enums\CrawlOutcomeEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use Illuminate\Database\Eloquent\Factories\Factory;
/**
 * Factory for PageCrawl models.
 *
 * The default definition describes a crawl with no result recorded yet and
 * no page attached; chain page() to attach one, and successful() or failed()
 * to record an outcome.
 *
 * @extends Factory<PageCrawl>
 */
class PageCrawlFactory extends Factory
{
    /**
     * Baseline attributes: every result column is null.
     *
     * @return array<string, mixed>
     */
    public function definition(): array
    {
        $identity = [
            'page_id' => null,
            'domain' => 'example.com',
            'priority' => 0,
        ];

        $emptyResult = [
            'completed_at' => null,
            'outcome' => null,
            'status_code' => null,
            'error_message' => null,
        ];

        return [...$identity, ...$emptyResult];
    }

    /**
     * Attach the crawl to the given page.
     */
    public function page(Page $page): static
    {
        // Kept as a (non-static) closure so the id is read when the model is built.
        return $this->state(function () use ($page): array {
            return ['page_id' => $page->id];
        });
    }

    /**
     * Mark the crawl as completed with a Success outcome.
     */
    public function successful(): static
    {
        return $this->state(function (): array {
            return [
                'outcome' => CrawlOutcomeEnum::Success,
                'completed_at' => now(),
            ];
        });
    }

    /**
     * Mark the crawl as completed with a Failed outcome and the given message.
     */
    public function failed(string $errorMessage): static
    {
        return $this->state(function () use ($errorMessage): array {
            return [
                'outcome' => CrawlOutcomeEnum::Failed,
                'completed_at' => now(),
                'error_message' => $errorMessage,
            ];
        });
    }
}

View file

@ -15,7 +15,6 @@ public function up(): void
$table->id();
$table->text('url')->unique();
$table->string('status')->default(PageStatusEnum::Discovered->value)->index();
$table->string('language', 35)->nullable()->index();
$table->string('title')->nullable();
$table->foreignId('instance_id')
->nullable()

View file

@ -1,34 +0,0 @@
<?php
declare(strict_types=1);
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Create the `page_crawls` table.
     */
    public function up(): void
    {
        Schema::create('page_crawls', static function (Blueprint $table): void {
            $table->id();

            // Crawl rows are removed together with the page they belong to.
            $table->foreignId('page_id')->constrained('pages')->cascadeOnDelete();

            $table->string('domain');
            $table->smallInteger('priority')->default(0);

            // Result columns are all nullable.
            $table->timestampTz('completed_at')->nullable();
            $table->string('outcome')->nullable();
            $table->smallInteger('status_code')->nullable();
            $table->text('error_message')->nullable();

            $table->timestampsTz();

            // Composite index on page and creation time.
            $table->index(['page_id', 'created_at']);
        });
    }

    /**
     * Drop the `page_crawls` table.
     */
    public function down(): void
    {
        Schema::dropIfExists('page_crawls');
    }
};

View file

@ -1,60 +0,0 @@
<x-layout>
<main>
<h1>About TroveBot</h1>
<p>
<strong>Trove</strong> is a federated search engine for the small web,
seeded by fediverse attention and ranked by domain coherence rather than
commercial authority. <strong>TroveBot</strong> is its crawler — it
discovers and indexes URLs shared by people on the fediverse, then
follows the citations they make to find more of the small web.
</p>
<h2>Identity</h2>
<p>TroveBot identifies itself with the following User-Agent string:</p>
<pre><code>TroveBot/0.1 (+https://trove.lvl0.xyz/bot)</code></pre>
<h2>Crawling behavior</h2>
<ul>
<li>Respects <code>robots.txt</code> rules under <code>User-agent: TroveBot</code> (and the wildcard <code>User-agent: *</code> as a fallback).</li>
<li>Polite per-domain rate limit — at most a few requests per minute per host.</li>
<li>Follows up to 5 redirects per URL.</li>
<li>Fetches HTML only. PDFs, images, and other binary content are recorded as discovered but never re-fetched.</li>
<li>Does not execute JavaScript, does not crawl behind authentication, does not crawl URLs containing user credentials.</li>
</ul>
<h2>Opt out</h2>
<p>
Block TroveBot entirely by adding the following to your site's
<code>robots.txt</code>:
</p>
<pre><code>User-agent: TroveBot
Disallow: /</code></pre>
<p>
Or block specific paths:
</p>
<pre><code>User-agent: TroveBot
Disallow: /private/
Disallow: /admin/</code></pre>
<h2>Contact &amp; source</h2>
<ul>
<li>
Issues, questions, abuse reports:
<a href="https://forge.lvl0.xyz/lvl0/trove/issues">forge.lvl0.xyz/lvl0/trove/issues</a>
</li>
<li>
Source code:
<a href="https://forge.lvl0.xyz/lvl0/trove">forge.lvl0.xyz/lvl0/trove</a>
</li>
</ul>
</main>
</x-layout>

View file

@ -1,18 +0,0 @@
<!DOCTYPE html>
<html lang="{{ str_replace('_', '-', app()->getLocale()) }}">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{{ $title ?? config('app.name') }}</title>
@vite(['resources/css/app.css', 'resources/js/app.js'])
@livewireStyles
</head>
<body>
{{ $slot }}
@livewireScripts
</body>
</html>

View file

@ -1,14 +0,0 @@
<div>
@error('rate_limit') <p>{{ $message }}</p> @enderror
@if ($confirmedUrl !== null)
<p>Thanks, we've received <strong>{{ $confirmedUrl }}</strong></p>
@else
<form wire:submit="submit">
<label for="url">URL</label>
<input id="url" type="url" wire:model="url" required>
@error('url') <p>{{ $message }}</p> @enderror
<button type="submit">Submit</button>
</form>
@endif
</div>

View file

@ -1,3 +0,0 @@
<x-layout>
<livewire:url-submission-form />
</x-layout>

View file

@ -1,13 +1,7 @@
<?php
declare(strict_types=1);
use Illuminate\Support\Facades\Route;
Route::get('/', function () {
return view('welcome');
});
Route::view('/submit', 'urls.submit');
Route::view('/bot', 'bot');

View file

@ -92,10 +92,6 @@ pkgs.mkShell {
podman-compose -f $COMPOSE_FILE exec app php artisan "$@"
}
dev-composer() {
podman-compose -f $COMPOSE_FILE exec app composer "$@"
}
# ===================
# BUILD COMMANDS
# ===================
@ -145,7 +141,6 @@ pkgs.mkShell {
echo " dev-logs-redis Tail Redis logs"
echo " dev-shell Shell into app container"
echo " dev-artisan <cmd> Run artisan command"
echo " dev-composer <cmd> Run composer command"
echo " base-build Build and push image"
echo ""
echo "Services:"

View file

@ -1,330 +0,0 @@
<?php
declare(strict_types=1);
namespace Tests\Feature\Actions;
use App\Actions\FetchPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\ValueObjects\FetchResult;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Psr7\Request;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Http;
use Tests\TestCase;
/**
 * Feature tests for FetchPageAction: outcome classification per HTTP result,
 * plus title / text / outbound-link / word-count extraction on success.
 */
class FetchPageActionTest extends TestCase
{
    /**
     * Make any request that no fake covers fail loudly, so a test in this
     * class can never reach the real network by accident.
     */
    protected function setUp(): void
    {
        parent::setUp();

        Http::preventStrayRequests();
    }

    public function test_successful_html_fetch_returns_success_outcome(): void
    {
        $this->fakeHtmlPage('<html><body>Hello</body></html>');

        $result = $this->makeAction()('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $result);
        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertSame(200, $result->statusCode);
        $this->assertNotNull($result->finalUrl);
    }

    public function test_4xx_response_returns_blocked_4xx(): void
    {
        $this->fakeExampleResponse('Not Found', 404);

        $result = $this->makeAction()('https://example.com/missing');

        $this->assertInstanceOf(FetchResult::class, $result);
        $this->assertSame(CrawlOutcomeEnum::Blocked4xx, $result->outcome);
        $this->assertSame(404, $result->statusCode);
        $this->assertIsString($result->errorMessage);
        $this->assertStringContainsString('404', $result->errorMessage);
        $this->assertNotNull($result->finalUrl);
    }

    public function test_5xx_response_returns_blocked_5xx(): void
    {
        $this->fakeExampleResponse('Service Unavailable', 503);

        $result = $this->makeAction()('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $result);
        $this->assertSame(CrawlOutcomeEnum::Blocked5xx, $result->outcome);
        $this->assertSame(503, $result->statusCode);
        $this->assertIsString($result->errorMessage);
        $this->assertStringContainsString('503', $result->errorMessage);
        $this->assertNotNull($result->finalUrl);
    }

    public function test_non_html_content_type_returns_rejected(): void
    {
        $this->fakeExampleResponse('PDF binary stuff', 200, ['Content-Type' => 'application/pdf']);

        $result = $this->makeAction()('https://example.com/document.pdf');

        $this->assertInstanceOf(FetchResult::class, $result);
        $this->assertSame(CrawlOutcomeEnum::Rejected, $result->outcome);
        $this->assertSame(200, $result->statusCode);
        $this->assertIsString($result->errorMessage);
        $this->assertStringContainsString('application/pdf', $result->errorMessage);
        $this->assertNotNull($result->finalUrl);
        $this->assertNull($result->title);
        $this->assertNull($result->extractedText);
        $this->assertEmpty($result->outboundLinks);
        $this->assertNull($result->wordCount);
    }

    public function test_text_html_with_charset_is_accepted(): void
    {
        $this->fakeHtmlPage('<html><body>Hello charset world</body></html>', 'text/html; charset=utf-8');

        $result = $this->makeAction()('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $result);
        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertSame(200, $result->statusCode);
    }

    public function test_connection_failure_returns_failed(): void
    {
        // errno 6 accompanies the "could not resolve host" message.
        $this->fakeConnectException('Could not resolve host', 6);

        $result = $this->makeAction()('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $result);
        $this->assertSame(CrawlOutcomeEnum::Failed, $result->outcome);
        $this->assertNull($result->statusCode);
        $this->assertNull($result->finalUrl);
        $this->assertIsString($result->errorMessage);
        $this->assertNull($result->title);
        $this->assertNull($result->extractedText);
        $this->assertEmpty($result->outboundLinks);
        $this->assertNull($result->wordCount);
    }

    public function test_timeout_returns_timeout(): void
    {
        // errno 28 accompanies the "operation timed out" message.
        $this->fakeConnectException('cURL error 28: Operation timed out', 28);

        $result = $this->makeAction()('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $result);
        $this->assertSame(CrawlOutcomeEnum::Timeout, $result->outcome);
        $this->assertNull($result->statusCode);
        $this->assertNull($result->finalUrl);
        $this->assertIsString($result->errorMessage);
    }

    public function test_success_extracts_title_from_html(): void
    {
        $this->fakeHtmlPage(
            '<html><head><title>My Page Title</title></head><body><p>Some content.</p></body></html>',
        );

        $result = $this->makeAction()('https://example.com/page');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertSame('My Page Title', $result->title);
    }

    public function test_success_extracts_main_text(): void
    {
        $html = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Article Title</title></head>
        <body>
        <nav>Navigation links</nav>
        <article>
        <h1>The Real Article</h1>
        <p>This is the main article body that should be extracted by readability.</p>
        <p>Multiple paragraphs prove the extractor works on the full content.</p>
        </article>
        <footer>Site footer noise</footer>
        </body>
        </html>
        HTML;

        $this->fakeHtmlPage($html);

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertNotNull($result->extractedText);
        $this->assertStringContainsString('main article body', $result->extractedText);
    }

    public function test_success_extracts_and_filters_outbound_links(): void
    {
        $html = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Article With Links</title></head>
        <body>
        <nav>
        <a href="/home">Home (nav, should be filtered out by Readability scope)</a>
        </nav>
        <article>
        <h1>Article Title</h1>
        <p>This article references <a href="https://other.com/article">an external article</a>.</p>
        <p>And a <a href="/related-post">relative link to a related post</a> on the same site.</p>
        <p>Plus a <a href="http://192.168.1.1/admin">private IP link</a> that should be rejected.</p>
        <p>And a <a href="https://user:pass@evil.com/">credentials URL</a> that should be rejected.</p>
        <p>And a <a href="ftp://files.example.com/">non-http scheme</a> that should be rejected.</p>
        </article>
        <footer>
        <a href="/privacy">Privacy (footer, filtered by Readability scope)</a>
        </footer>
        </body>
        </html>
        HTML;

        $this->fakeHtmlPage($html);

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertInstanceOf(Collection::class, $result->outboundLinks);

        // Only the external article and the resolved relative link survive.
        $this->assertCount(2, $result->outboundLinks);

        $links = $result->outboundLinks->all();
        $this->assertContains('https://other.com/article', $links);
        $this->assertContains('https://example.com/related-post', $links);
        $this->assertNotContains('http://192.168.1.1/admin', $links);
        $this->assertNotContains('https://user:pass@evil.com/', $links);
        $this->assertNotContains('ftp://files.example.com/', $links);
    }

    public function test_success_calculates_word_count(): void
    {
        $html = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Word Count Test</title></head>
        <body>
        <article>
        <p>This article body has exactly nine words total here.</p>
        </article>
        </body>
        </html>
        HTML;

        $this->fakeHtmlPage($html);

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertSame(9, $result->wordCount);
    }

    public function test_uppercase_content_type_is_accepted_as_html(): void
    {
        $this->fakeHtmlPage(
            '<html><head><title>Uppercase CT</title></head><body><p>Content here.</p></body></html>',
            'Text/HTML; charset=utf-8',
        );

        $result = $this->makeAction()('https://example.com/page');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
    }

    public function test_empty_href_is_filtered_from_outbound_links(): void
    {
        $html = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Empty Href Test</title></head>
        <body>
        <article>
        <p>This paragraph has <a href="">an empty href anchor</a> that should be dropped.</p>
        </article>
        </body>
        </html>
        HTML;

        $this->fakeHtmlPage($html);

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertCount(0, $result->outboundLinks);
    }

    public function test_fragment_only_href_is_filtered_from_outbound_links(): void
    {
        $html = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Fragment Href Test</title></head>
        <body>
        <article>
        <p>Jump to <a href="#section-2">section 2</a> of this page.</p>
        </article>
        </body>
        </html>
        HTML;

        $this->fakeHtmlPage($html);

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertCount(0, $result->outboundLinks);
    }

    /**
     * Resolve the action under test from the container.
     */
    private function makeAction(): FetchPageAction
    {
        return app(FetchPageAction::class);
    }

    /**
     * Answer every request to example.com with one canned response.
     *
     * @param  array<string, string>  $headers
     */
    private function fakeExampleResponse(string $body, int $status = 200, array $headers = []): void
    {
        Http::fake([
            'example.com/*' => Http::response($body, $status, $headers),
        ]);
    }

    /**
     * Answer every request to example.com with a 200 carrying the given markup.
     */
    private function fakeHtmlPage(string $html, string $contentType = 'text/html'): void
    {
        $this->fakeExampleResponse($html, 200, ['Content-Type' => $contentType]);
    }

    /**
     * Make every request throw a Guzzle ConnectException carrying the given
     * errno in its handler context.
     */
    private function fakeConnectException(string $message, int $errno): void
    {
        Http::fake(function () use ($message, $errno) {
            throw new ConnectException(
                $message,
                new Request('GET', 'https://example.com/page'),
                null,
                ['errno' => $errno],
            );
        });
    }
}

View file

@ -1,39 +0,0 @@
<?php
declare(strict_types=1);
namespace Tests\Feature;
use Tests\TestCase;
/**
 * Smoke tests for the public `/bot` page: it renders and shows the
 * User-Agent string, a robots.txt opt-out example and the repository link.
 */
class BotPageTest extends TestCase
{
    public function test_bot_page_renders_at_public_route(): void
    {
        $this->get('/bot')->assertStatus(200);
    }

    public function test_bot_page_contains_user_agent_string(): void
    {
        $userAgent = 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)';

        $this->get('/bot')->assertSee($userAgent, escape: false);
    }

    public function test_bot_page_contains_robots_txt_opt_out_example(): void
    {
        // Both lines of the "block everything" robots.txt snippet must be present.
        $this->get('/bot')
            ->assertSee('User-agent: TroveBot', escape: false)
            ->assertSee('Disallow: /', escape: false);
    }

    public function test_bot_page_links_to_forge_repository(): void
    {
        $repository = 'https://forge.lvl0.xyz/lvl0/trove';

        $this->get('/bot')->assertSee($repository, escape: false);
    }
}

View file

@ -1,393 +0,0 @@
<?php
declare(strict_types=1);
namespace Tests\Feature\Jobs;
use App\Actions\FetchPageAction;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use App\Jobs\ProcessCrawlJob;
use App\Models\Page;
use App\Models\PageCrawl;
use App\ValueObjects\FetchResult;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Queue;
use Mockery;
use Tests\TestCase;
class ProcessCrawlJobTest extends TestCase
{
use RefreshDatabase;
public function test_creating_a_page_crawl_dispatches_process_crawl_job(): void
{
Queue::fake();
$page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
PageCrawl::factory()->page($page)->create();
Queue::assertPushed(ProcessCrawlJob::class);
}
public function test_dispatched_job_carries_the_correct_page_crawl(): void
{
Queue::fake();
$page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
$crawl = PageCrawl::factory()->page($page)->create();
Queue::assertPushed(
ProcessCrawlJob::class,
fn (ProcessCrawlJob $job) => $job->pageCrawl->id === $crawl->id,
);
}
public function test_handle_writes_outcome_to_page_crawl_on_success(): void
{
Queue::fake();
$this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200, title: 'Hello', extractedText: 'hi', wordCount: 1);
$page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle();
$fresh = $crawl->fresh();
$this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome);
$this->assertNotNull($fresh->completed_at);
$this->assertInstanceOf(Carbon::class, $fresh->completed_at);
$this->assertSame(200, $fresh->status_code);
$this->assertNull($fresh->error_message);
}
public function test_handle_updates_page_to_fetched_on_success(): void
{
Queue::fake();
$this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200, title: 'Hello', extractedText: 'hi', wordCount: 1);
$page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle();
$fresh = $page->fresh();
$this->assertSame(PageStatusEnum::Fetched, $fresh->status);
$this->assertNotNull($fresh->fetched_at);
$this->assertInstanceOf(Carbon::class, $fresh->fetched_at);
$this->assertSame('Hello', $fresh->title);
}
public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void
{
Queue::fake();
$this->mockFetchPageAction(CrawlOutcomeEnum::Rejected, statusCode: 200, errorMessage: 'Unsupported Content-Type: application/pdf');
$page = Page::factory()->createQuietly(['url' => 'https://example.com/brochure.pdf']);
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle();
$fresh = $page->fresh();
$this->assertSame(PageStatusEnum::Rejected, $fresh->status);
$this->assertNull($fresh->fetched_at);
}
public function test_handle_updates_page_to_failed_on_blocked_4xx(): void
{
Queue::fake();
$this->mockFetchPageAction(CrawlOutcomeEnum::Blocked4xx, statusCode: 404, errorMessage: 'HTTP 404');
$page = Page::factory()->createQuietly(['url' => 'https://example.com/gone']);
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle();
$fresh = $page->fresh();
$this->assertSame(PageStatusEnum::Failed, $fresh->status);
$this->assertNotNull($fresh->failed_at);
$this->assertInstanceOf(Carbon::class, $fresh->failed_at);
}
public function test_handle_updates_page_to_failed_on_timeout(): void
{
Queue::fake();
$this->mockFetchPageAction(CrawlOutcomeEnum::Timeout, errorMessage: 'Connection timed out after 10 seconds');
$page = Page::factory()->createQuietly(['url' => 'https://example.com/slow']);
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle();
$fresh = $page->fresh();
$this->assertSame(PageStatusEnum::Failed, $fresh->status);
$this->assertNotNull($fresh->failed_at);
$this->assertInstanceOf(Carbon::class, $fresh->failed_at);
}
public function test_handle_schedules_retry_on_transient_failure(): void
{
Queue::fake();
$this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');
$page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle();
// A second PageCrawl row (the retry) must have been inserted for the same page
$this->assertSame(2, PageCrawl::where('page_id', $page->id)->count());
// The new row is pending — outcome IS NULL
$retryRow = PageCrawl::where('page_id', $page->id)
->whereNull('outcome')
->first();
$this->assertNotNull($retryRow);
// A delayed ProcessCrawlJob must have been pushed for the retry row
Queue::assertPushed(
ProcessCrawlJob::class,
fn (ProcessCrawlJob $job) => $job->pageCrawl->page_id === $page->id
&& $job->pageCrawl->id === $retryRow->id,
);
}
/**
 * When a page already has three crawl rows and the third one fails, the job
 * must neither insert another row nor push another ProcessCrawlJob.
 */
public function test_handle_does_not_retry_after_three_attempts(): void
{
Queue::fake();
$this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');
$page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);
// Two failed attempts already exist; the pending crawl handled below is the
// third row for this page, which is the cap.
PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly();
PageCrawl::factory()->page($page)->failed('Connection refused')->createQuietly();
$thirdCrawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $thirdCrawl])
->handle();
// No 4th row must appear — retry cap reached
$this->assertSame(3, PageCrawl::where('page_id', $page->id)->count());
// No retry job dispatched
Queue::assertNotPushed(ProcessCrawlJob::class);
}
public function test_handle_writes_failed_outcome_to_page_crawl(): void
{
Queue::fake();
$this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'boom');
$page = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
$crawl = PageCrawl::factory()->page($page)->createQuietly();
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
->handle();
$this->assertDatabaseHas('page_crawls', [
'id' => $crawl->id,
'outcome' => CrawlOutcomeEnum::Failed->value,
'status_code' => null,
'error_message' => 'boom',
]);
}
/**
 * A Failed fetch outcome leaves the owning page in the Failed status.
 */
public function test_handle_updates_page_to_failed_on_failed_outcome(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    // Re-read the page from the database to observe what handle() persisted.
    $reloadedPage = $page->fresh();
    self::assertSame(PageStatusEnum::Failed, $reloadedPage->status);
}
/**
 * A Blocked5xx fetch outcome (HTTP 503 here) leaves the owning page in the Failed status.
 */
public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Blocked5xx,
        statusCode: 503,
        errorMessage: 'HTTP 503',
    );

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    // Re-read the page from the database to observe what handle() persisted.
    $reloadedPage = $page->fresh();
    self::assertSame(PageStatusEnum::Failed, $reloadedPage->status);
}
/**
 * A BlockedRobots fetch outcome leaves the owning page in the Failed status.
 */
public function test_handle_updates_page_to_failed_on_blocked_robots(): void
{
    Queue::fake();
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::BlockedRobots,
        errorMessage: 'Disallowed by robots.txt',
    );

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    // Re-read the page from the database to observe what handle() persisted.
    $reloadedPage = $page->fresh();
    self::assertSame(PageStatusEnum::Failed, $reloadedPage->status);
}
/**
 * Outbound links carried by a Failed fetch result must be ignored:
 * no page row is created for them and the source page stays the only page.
 */
public function test_handle_does_not_register_outbound_links_on_failure(): void
{
    Queue::fake();

    // The mocked result deliberately carries a link even though the outcome is Failed.
    $strayLinks = collect(['https://should-not-be-registered.com/page']);
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Failed,
        outboundLinks: $strayLinks,
        errorMessage: 'Connection refused',
    );

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/broken']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']);
    self::assertSame(1, Page::count());
}
/**
 * On a Success outcome every outbound link of the fetch result ends up as a
 * row in `pages`, next to the source page itself (three pages in total).
 */
public function test_handle_registers_outbound_links_on_success(): void
{
    Queue::fake();

    $discoveredLinks = collect([
        'https://other.com/article-1',
        'https://another.com/post-2',
    ]);
    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        finalUrl: 'https://source.com/article',
        title: 'Source Article',
        extractedText: 'some text',
        outboundLinks: $discoveredLinks,
        wordCount: 2,
    );

    $page = Page::factory()->createQuietly(['url' => 'https://source.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    $this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']);
    $this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']);

    // Source page + two link targets.
    self::assertSame(3, Page::count());
}
/**
 * With the domain lock already taken, handle() must bail out before fetching:
 * the fetcher is never invoked, no outcome is written, and the page status is untouched.
 *
 * NOTE(review): the test name mentions releasing the job, but only the early
 * return is asserted here; the release itself is not observed. Confirm
 * whether that is intended.
 */
public function test_handle_releases_job_when_domain_is_locked(): void
{
    Queue::fake();

    // Take the domain lock up front so the job finds it already held.
    Cache::lock('crawler:domain:example.com', 10)->get();

    // Any call to the fetcher fails the test: the job has to stop before reaching it.
    $forbiddenFetcher = Mockery::mock(FetchPageAction::class);
    $forbiddenFetcher->shouldNotReceive('__invoke');
    $this->app->instance(FetchPageAction::class, $forbiddenFetcher);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    (new ProcessCrawlJob($pageCrawl))->handle();

    // The crawl row is still pending.
    $reloadedCrawl = $pageCrawl->fresh();
    self::assertNull($reloadedCrawl->outcome);

    // The page still reports Discovered.
    $reloadedPage = $page->fresh();
    self::assertSame(PageStatusEnum::Discovered, $reloadedPage->status);
}
/**
 * After a successful handle() run the domain lock must still be held,
 * so a fresh attempt to acquire the same lock fails.
 */
public function test_handle_does_not_release_lock_after_completion(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);

    $page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();

    (new ProcessCrawlJob($pageCrawl))->handle();

    // get() returns true only when the lock is free. Had handle() released its
    // lock, this acquisition would succeed, so it has to come back false.
    $acquiredAgain = Cache::lock('crawler:domain:example.com', 10)->get();

    self::assertFalse(
        $acquiredAgain,
        'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.',
    );
}
/**
 * handle() takes the per-domain lock and still performs the fetch:
 * afterwards the lock for the crawl's domain cannot be acquired again and
 * the crawl row carries the Success outcome.
 */
public function test_handle_acquires_domain_lock_before_fetching(): void
{
    Queue::fake();
    $this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);

    $page = Page::factory()->createQuietly(['url' => 'https://lock-test.example.com/article']);
    $pageCrawl = PageCrawl::factory()->page($page)->createQuietly();
    $domain = $pageCrawl->domain;

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pageCrawl]);
    $job->handle();

    // A second acquisition attempt must fail because the job's lock is still held.
    $acquiredAgain = Cache::lock("crawler:domain:{$domain}", 10)->get();
    self::assertFalse(
        $acquiredAgain,
        'Expected the domain lock to still be held after handle() ran, but it was free.',
    );

    // The outcome was written, so taking the lock did not prevent the fetch.
    $reloadedCrawl = $pageCrawl->fresh();
    self::assertSame(CrawlOutcomeEnum::Success, $reloadedCrawl->outcome);
}
/**
 * Binds a Mockery double for FetchPageAction into the container whose
 * __invoke() always returns a FetchResult built from the given arguments.
 *
 * @param  CrawlOutcomeEnum  $outcome  Outcome the fake fetch reports.
 * @param  int|null  $statusCode  HTTP status to report, if any.
 * @param  string|null  $finalUrl  Final URL to report.
 * @param  string|null  $title  Page title to report.
 * @param  string|null  $extractedText  Extracted body text to report.
 * @param  Collection|null  $outboundLinks  Links to report; an empty collection when null.
 * @param  int|null  $wordCount  Word count to report.
 * @param  string|null  $errorMessage  Error message to report.
 */
private function mockFetchPageAction(
    CrawlOutcomeEnum $outcome,
    ?int $statusCode = null,
    ?string $finalUrl = 'https://example.com/article',
    ?string $title = null,
    ?string $extractedText = null,
    ?Collection $outboundLinks = null,
    ?int $wordCount = null,
    ?string $errorMessage = null,
): void {
    // FetchResult always receives a collection, never null.
    $links = $outboundLinks ?? collect();

    $cannedResult = new FetchResult(
        outcome: $outcome,
        statusCode: $statusCode,
        finalUrl: $finalUrl,
        title: $title,
        extractedText: $extractedText,
        outboundLinks: $links,
        wordCount: $wordCount,
        errorMessage: $errorMessage,
    );

    $fetcherDouble = Mockery::mock(FetchPageAction::class);
    $fetcherDouble->shouldReceive('__invoke')->andReturn($cannedResult);

    $this->app->instance(FetchPageAction::class, $fetcherDouble);
}
}

View file

@ -1,70 +0,0 @@
<?php
declare(strict_types=1);
namespace Tests\Feature;
use App\Models\Page;
use App\Models\PageCrawl;
use App\Services\UrlService;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Tests\TestCase;
/**
 * Feature tests for the page_crawls rows that appear as a side effect of
 * creating Page models.
 */
class PageQueuePopulationTest extends TestCase
{
    use RefreshDatabase;

    /**
     * Creating a page yields a page_crawls row for it, carrying the URL's
     * host as domain and priority 0.
     */
    public function test_creating_a_page_inserts_a_page_crawl_row(): void
    {
        $url = 'https://example-blog.com/article';
        $page = Page::factory()->create(['url' => $url]);

        $urlService = new UrlService();
        $expectedRow = [
            'page_id' => $page->id,
            'domain' => $urlService->host($url),
            'priority' => 0,
        ];
        $this->assertDatabaseHas('page_crawls', $expectedRow);

        $storedCrawl = PageCrawl::where('page_id', $page->id)->first();
        self::assertNotNull($storedCrawl);
    }

    /**
     * firstOrCreate() on an already-known URL must not add a second crawl row.
     */
    public function test_first_or_create_with_existing_url_does_not_insert_duplicate_crawl(): void
    {
        $url = 'https://example-blog.com/article';
        Page::factory()->create(['url' => $url]);

        // The row already exists, so this call is a lookup, not a creation.
        Page::firstOrCreate(['url' => $url], ['status' => 'discovered']);

        $this->assertDatabaseCount('page_crawls', 1);
    }

    /**
     * Updating an existing page must not add another crawl row.
     */
    public function test_updating_a_page_does_not_insert_another_crawl(): void
    {
        $page = Page::factory()->create(['url' => 'https://example-blog.com/article']);

        $page->update(['title' => 'New Title']);

        $this->assertDatabaseCount('page_crawls', 1);
    }

    /**
     * Creating a page with an unparsable URL throws InvalidArgumentException;
     * the page row is nevertheless stored and no crawl row is inserted.
     */
    public function test_bad_url_throws_exception_page_persists_no_crawl_inserted(): void
    {
        // Caught manually instead of expectException() so the database
        // assertions below still run after the throw.
        $thrown = null;
        try {
            Page::create(['url' => 'not-a-url', 'status' => 'discovered']);
        } catch (\InvalidArgumentException $exception) {
            $thrown = $exception;
        }

        self::assertNotNull($thrown, 'Expected InvalidArgumentException to be thrown');
        $this->assertDatabaseHas('pages', ['url' => 'not-a-url']);
        $this->assertDatabaseCount('page_crawls', 0);
    }
}

View file

@ -4,6 +4,7 @@
namespace Tests\Feature;
use App\Enums\PageStatusEnum;
use App\Listeners\UrlDiscoveredListener;
use App\Models\Page;
use App\Models\PageLink;
@ -65,10 +66,15 @@ public function test_listener_creates_target_page_and_source_page_with_link(): v
// Target page
$targetPage = Page::where('url', 'https://example-blog.com/article')->first();
$this->assertNotNull($targetPage);
$this->assertSame(PageStatusEnum::Discovered, $targetPage->status);
$this->assertSame($instance->id, $targetPage->instance_id);
// Source page
$sourcePage = Page::where('url', 'https://mastodon.social/@alice/109876543210')->first();
$this->assertNotNull($sourcePage);
$this->assertSame(PageStatusEnum::Discovered, $sourcePage->status);
$this->assertSame($instance->id, $sourcePage->instance_id);
$this->assertNull($sourcePage->fetched_at);
// Edge
$link = PageLink::where('source_page_id', $sourcePage->id)
@ -109,30 +115,8 @@ public function test_listener_with_null_post_url_creates_only_target_page(): voi
$targetPage = Page::where('url', 'https://example-blog.com/article')->first();
$this->assertNotNull($targetPage);
}
// ---------------------------------------------------------------------------
// Integration — UrlDiscovered event enqueues crawls for both pages via observer
// ---------------------------------------------------------------------------
/**
 * Dispatches a UrlDiscovered event carrying both a target URL and a post URL
 * and asserts that two page_crawls rows exist afterwards, one per domain.
 */
public function test_url_discovered_event_enqueues_crawls_via_observer(): void
{
$instance = $this->makeInstance();
$event = new UrlDiscovered(
url: 'https://example-blog.com/article',
instanceId: $instance->id,
discoveredAt: CarbonImmutable::parse('2026-04-26T12:00:00Z'),
postUrl: 'https://mastodon.social/@alice/109876543210',
postBody: 'check this out https://example-blog.com/article',
);
event($event);
// Listener creates 2 pages (target + source); observer fires for each → 2 crawl rows
$this->assertDatabaseCount('page_crawls', 2);
$this->assertDatabaseHas('page_crawls', ['domain' => 'example-blog.com']);
$this->assertDatabaseHas('page_crawls', ['domain' => 'mastodon.social']);
// NOTE(review): $targetPage is never assigned inside this method as shown here.
// These two assertions presumably belong to the other side of the diff hunk
// (the preceding test); confirm against the actual file before relying on them.
$this->assertSame(PageStatusEnum::Discovered, $targetPage->status);
$this->assertSame($instance->id, $targetPage->instance_id);
}
// ---------------------------------------------------------------------------

View file

@ -1,158 +0,0 @@
<?php
declare(strict_types=1);
namespace Tests\Feature;
use App\Enums\PageStatusEnum;
use App\Livewire\UrlSubmissionForm;
use App\Models\Page;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Livewire\Livewire;
use PHPUnit\Framework\Attributes\DataProvider;
use Tests\TestCase;
/**
 * Feature tests for the public URL submission form (Livewire component
 * UrlSubmissionForm, rendered at /submit).
 */
class UrlSubmissionTest extends TestCase
{
    use RefreshDatabase;

    // -------------------------------------------------------------------------
    // Test 1 — route renders the submission form
    // -------------------------------------------------------------------------

    /**
     * GET /submit answers 200 and renders the url-submission-form component.
     */
    public function test_submission_form_renders_at_public_route(): void
    {
        $response = $this->get('/submit');

        $response->assertStatus(200);
        $response->assertSeeLivewire('url-submission-form');
    }

    // -------------------------------------------------------------------------
    // Test 2 — valid submission creates a page row as Discovered
    // -------------------------------------------------------------------------

    /**
     * Submitting a valid URL stores a page row for it in the Discovered status.
     */
    public function test_valid_url_submission_creates_page_as_discovered(): void
    {
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', 'https://example.com/interesting-post')
            ->call('submit')
            ->assertHasNoErrors();

        // Pin the status as well as the URL: the test name promises "as discovered",
        // and asserting only on the URL would pass for a row in any status.
        $this->assertDatabaseHas('pages', [
            'url' => 'https://example.com/interesting-post',
            'status' => PageStatusEnum::Discovered->value,
        ]);
    }

    // -------------------------------------------------------------------------
    // Test 3 — duplicate submission is idempotent (no second row created)
    // -------------------------------------------------------------------------

    /**
     * Submitting a URL that already has a page row succeeds without creating a second row.
     */
    public function test_duplicate_url_submission_does_not_create_second_page(): void
    {
        $url = 'https://example.com/seen-before';

        Page::factory()->create([
            'url' => $url,
            'status' => PageStatusEnum::Discovered,
        ]);

        Livewire::test(UrlSubmissionForm::class)
            ->set('url', $url)
            ->call('submit')
            ->assertHasNoErrors();

        $this->assertDatabaseCount('pages', 1);
    }

    // -------------------------------------------------------------------------
    // Test 4 — confirmation state echoes submitted URL
    // -------------------------------------------------------------------------

    /**
     * After a successful submit the component stores the URL in confirmedUrl,
     * clears the url input, and shows the submitted URL in its output.
     */
    public function test_confirmation_state_echoes_submitted_url(): void
    {
        $url = 'https://example.com/great-article';

        Livewire::test(UrlSubmissionForm::class)
            ->set('url', $url)
            ->call('submit')
            ->assertHasNoErrors()
            ->assertSet('confirmedUrl', $url)
            ->assertSet('url', '')
            ->assertSee($url);
    }

    // -------------------------------------------------------------------------
    // Test 5 — empty URL fails validation (regression lock)
    // -------------------------------------------------------------------------

    /**
     * An empty url input fails the "required" rule.
     */
    public function test_missing_url_fails_validation(): void
    {
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', '')
            ->call('submit')
            ->assertHasErrors(['url' => 'required']);
    }

    // -------------------------------------------------------------------------
    // Test 6 — invalid URL formats fail validation
    // -------------------------------------------------------------------------

    /**
     * Each malformed or disallowed URL from the provider produces a validation error on `url`.
     */
    #[DataProvider('invalidUrls')]
    public function test_invalid_url_formats_fail_validation(string $url): void
    {
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', $url)
            ->call('submit')
            ->assertHasErrors('url');
    }

    /**
     * Inputs the form must reject.
     *
     * @return array<string, array{string}>
     */
    public static function invalidUrls(): array
    {
        return [
            'no scheme' => ['not-a-url'],
            'disallowed scheme' => ['ftp://example.com'],
            'javascript scheme' => ['javascript:alert(1)'],
        ];
    }

    // -------------------------------------------------------------------------
    // Integration — form submission enqueues a crawl via PageObserver
    // -------------------------------------------------------------------------

    /**
     * A successful submission leaves exactly one page_crawls row, for the submitted URL's domain.
     */
    public function test_url_submission_form_enqueues_crawl_via_observer(): void
    {
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', 'https://example.com/article')
            ->call('submit')
            ->assertHasNoErrors();

        $this->assertDatabaseCount('page_crawls', 1);
        $this->assertDatabaseHas('page_crawls', ['domain' => 'example.com']);
    }

    // -------------------------------------------------------------------------
    // Test 7 — rate limit blocks the 11th submission within a minute
    // -------------------------------------------------------------------------

    /**
     * Ten submissions in a row succeed; the eleventh is rejected with a
     * rate_limit error, shows "Too many submissions", and stores no page.
     */
    public function test_rate_limit_blocks_eleventh_submission_within_a_minute(): void
    {
        // 10 submissions within the limit — each must succeed
        for ($i = 1; $i <= 10; $i++) {
            Livewire::test(UrlSubmissionForm::class)
                ->set('url', "https://example.com/post-{$i}")
                ->call('submit')
                ->assertHasNoErrors();
        }

        // 11th submission from the same IP must be blocked, with the message visible
        Livewire::test(UrlSubmissionForm::class)
            ->set('url', 'https://example.com/post-11')
            ->call('submit')
            ->assertHasErrors('rate_limit')
            ->assertSee('Too many submissions');

        // The 11th URL must NOT have been persisted
        $this->assertDatabaseCount('pages', 10);
    }
}

View file

@ -1,83 +0,0 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Actions;
use App\Actions\RegisterDiscoveredPageAction;
use App\Enums\PageStatusEnum;
use App\Models\Page;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
/**
 * Unit tests for RegisterDiscoveredPageAction: creating a page for a new URL
 * and returning the existing page, unchanged, for a known one.
 */
class RegisterDiscoveredPageActionTest extends TestCase
{
    use RefreshDatabase;

    /**
     * A new URL yields a persisted Page in the Discovered status with no instance attached.
     */
    public function test_creates_page_with_url_and_discovered_status(): void
    {
        $register = new RegisterDiscoveredPageAction();

        $page = $register('https://example.com/article');

        self::assertInstanceOf(Page::class, $page);
        self::assertSame('https://example.com/article', $page->url);
        self::assertSame(PageStatusEnum::Discovered, $page->status);
        self::assertNull($page->instance_id);
        $this->assertDatabaseHas('pages', ['url' => 'https://example.com/article']);
    }

    /**
     * When an instance id is passed, it is stored on the created page.
     */
    public function test_creates_page_with_provided_instance_id(): void
    {
        $instance = Instance::factory()
            ->type(InstanceType::Mastodon)
            ->enabled()
            ->create();
        $register = new RegisterDiscoveredPageAction();

        $page = $register('https://example.com/fediverse-post', instanceId: $instance->id);

        self::assertInstanceOf(Page::class, $page);
        self::assertSame($instance->id, $page->instance_id);

        $expectedRow = [
            'url' => 'https://example.com/fediverse-post',
            'instance_id' => $instance->id,
        ];
        $this->assertDatabaseHas('pages', $expectedRow);
    }

    /**
     * Calling the action for a URL that already has a page returns that page and adds no row.
     */
    public function test_returns_existing_page_when_url_already_exists(): void
    {
        $knownPage = Page::factory()->createQuietly([
            'url' => 'https://example.com/seen-before',
            'status' => PageStatusEnum::Discovered,
        ]);
        $register = new RegisterDiscoveredPageAction();

        $result = $register('https://example.com/seen-before');

        self::assertSame($knownPage->id, $result->id);
        $this->assertDatabaseCount('pages', 1);
    }

    /**
     * A page that is already Fetched keeps that status when the action sees its URL again.
     */
    public function test_existing_page_status_not_overwritten_on_duplicate_call(): void
    {
        Page::factory()->createQuietly([
            'url' => 'https://example.com/already-fetched',
            'status' => PageStatusEnum::Fetched,
        ]);
        $register = new RegisterDiscoveredPageAction();

        $result = $register('https://example.com/already-fetched');

        self::assertSame(PageStatusEnum::Fetched, $result->status);

        $expectedRow = [
            'url' => 'https://example.com/already-fetched',
            'status' => PageStatusEnum::Fetched,
        ];
        $this->assertDatabaseHas('pages', $expectedRow);
    }
}

View file

@ -1,75 +0,0 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Enums;
use App\Enums\CrawlOutcomeEnum;
use App\Enums\PageStatusEnum;
use Tests\TestCase;
/**
 * Unit tests for CrawlOutcomeEnum: its cases and backing values, and the
 * toPageStatus(), isRetryable() and shouldRegisterOutboundLinks() helpers.
 */
class CrawlOutcomeEnumTest extends TestCase
{
    /**
     * Every expected backing value resolves to the case with the expected name.
     */
    public function test_all_expected_cases_exist_with_correct_backing_values(): void
    {
        // backing value => case name
        $expected = [
            'success' => 'Success',
            'failed' => 'Failed',
            'timeout' => 'Timeout',
            'blocked_robots' => 'BlockedRobots',
            'blocked_4xx' => 'Blocked4xx',
            'blocked_5xx' => 'Blocked5xx',
            'rejected' => 'Rejected',
        ];

        foreach ($expected as $backingValue => $caseName) {
            $case = CrawlOutcomeEnum::from($backingValue);

            self::assertSame($caseName, $case->name, "Case name for '{$backingValue}' should be '{$caseName}'");
            self::assertSame($backingValue, $case->value, "Backing value for '{$caseName}' should be '{$backingValue}'");
        }
    }

    /**
     * The enum declares seven cases, no more and no fewer.
     */
    public function test_enum_has_exactly_seven_cases(): void
    {
        $allCases = CrawlOutcomeEnum::cases();

        self::assertCount(7, $allCases);
    }

    /**
     * Success maps to Fetched, Rejected to Rejected, and every other outcome to Failed.
     */
    public function test_to_page_status_maps_each_outcome_correctly(): void
    {
        self::assertSame(PageStatusEnum::Fetched, CrawlOutcomeEnum::Success->toPageStatus());
        self::assertSame(PageStatusEnum::Rejected, CrawlOutcomeEnum::Rejected->toPageStatus());

        $mappedToFailed = [
            CrawlOutcomeEnum::Failed,
            CrawlOutcomeEnum::Timeout,
            CrawlOutcomeEnum::Blocked4xx,
            CrawlOutcomeEnum::Blocked5xx,
            CrawlOutcomeEnum::BlockedRobots,
        ];
        foreach ($mappedToFailed as $outcome) {
            self::assertSame(PageStatusEnum::Failed, $outcome->toPageStatus());
        }
    }

    /**
     * Only Failed, Timeout and Blocked5xx report themselves as retryable.
     */
    public function test_is_retryable_returns_true_only_for_transient_failures(): void
    {
        // Transient network/server problems that may resolve later.
        $retryable = [
            CrawlOutcomeEnum::Failed,
            CrawlOutcomeEnum::Timeout,
            CrawlOutcomeEnum::Blocked5xx,
        ];
        foreach ($retryable as $outcome) {
            self::assertTrue($outcome->isRetryable());
        }

        // Finished, permanently failed, or refused by policy.
        $notRetryable = [
            CrawlOutcomeEnum::Success,
            CrawlOutcomeEnum::Rejected,
            CrawlOutcomeEnum::BlockedRobots,
            CrawlOutcomeEnum::Blocked4xx,
        ];
        foreach ($notRetryable as $outcome) {
            self::assertFalse($outcome->isRetryable());
        }
    }

    /**
     * Outbound links are registered for Success and for no other outcome.
     */
    public function test_should_register_outbound_links_returns_true_only_for_success(): void
    {
        self::assertTrue(CrawlOutcomeEnum::Success->shouldRegisterOutboundLinks());

        $withoutLinks = [
            CrawlOutcomeEnum::Failed,
            CrawlOutcomeEnum::Timeout,
            CrawlOutcomeEnum::Rejected,
            CrawlOutcomeEnum::BlockedRobots,
            CrawlOutcomeEnum::Blocked4xx,
            CrawlOutcomeEnum::Blocked5xx,
        ];
        foreach ($withoutLinks as $outcome) {
            self::assertFalse($outcome->shouldRegisterOutboundLinks());
        }
    }
}

View file

@ -1,33 +0,0 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Enums;
use App\Enums\PageStatusEnum;
use Tests\TestCase;
/**
 * Unit tests pinning the cases and backing values of PageStatusEnum.
 */
class PageStatusEnumTest extends TestCase
{
    /**
     * Every expected backing value resolves to the case with the expected name.
     */
    public function test_all_expected_cases_exist_with_correct_backing_values(): void
    {
        // backing value => case name
        $expected = [
            'discovered' => 'Discovered',
            'fetched' => 'Fetched',
            'failed' => 'Failed',
            'rejected' => 'Rejected',
        ];

        foreach ($expected as $backingValue => $caseName) {
            $case = PageStatusEnum::from($backingValue);

            self::assertSame($caseName, $case->name, "Case name for '{$backingValue}' should be '{$caseName}'");
            self::assertSame($backingValue, $case->value, "Backing value for '{$caseName}' should be '{$backingValue}'");
        }
    }

    /**
     * The enum declares four cases, no more and no fewer.
     */
    public function test_enum_has_exactly_four_cases(): void
    {
        $allCases = PageStatusEnum::cases();

        self::assertCount(4, $allCases);
    }
}

View file

@ -1,42 +0,0 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Models;
use App\Enums\CrawlOutcomeEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Tests\TestCase;
/**
 * Unit tests for the successful() and failed() states of the PageCrawl factory.
 */
class PageCrawlFactoryTest extends TestCase
{
    use RefreshDatabase;

    /**
     * successful() yields a completed crawl with the Success outcome and no error message.
     */
    public function test_factory_successful_state_produces_success_outcome(): void
    {
        Queue::fake();
        $page = Page::factory()->create();

        $pageCrawl = PageCrawl::factory()
            ->page($page)
            ->successful()
            ->create();

        self::assertSame(CrawlOutcomeEnum::Success, $pageCrawl->outcome);
        self::assertInstanceOf(Carbon::class, $pageCrawl->completed_at);
        self::assertNull($pageCrawl->error_message);
    }

    /**
     * failed($message) yields a completed crawl with the Failed outcome and the given message.
     */
    public function test_factory_failed_state_produces_failed_outcome_with_message(): void
    {
        Queue::fake();
        $page = Page::factory()->create();

        $pageCrawl = PageCrawl::factory()
            ->page($page)
            ->failed('Connection timed out')
            ->create();

        self::assertSame(CrawlOutcomeEnum::Failed, $pageCrawl->outcome);
        self::assertInstanceOf(Carbon::class, $pageCrawl->completed_at);
        self::assertSame('Connection timed out', $pageCrawl->error_message);
    }
}

View file

@ -1,111 +0,0 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Models;
use App\Enums\CrawlOutcomeEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Tests\TestCase;
/**
 * Unit tests for the PageCrawl model: attribute persistence and casts, the
 * page relationship, cascade deletion, and filtering pending rows.
 */
class PageCrawlTest extends TestCase
{
    use RefreshDatabase;

    /**
     * Fakes the queue for every test in this class.
     *
     * Previously only two of the four tests called Queue::fake(), although all
     * of them create PageCrawl rows through model events. Faking once here
     * keeps the tests isolated from anything those events may dispatch.
     */
    protected function setUp(): void
    {
        parent::setUp();

        Queue::fake();
    }

    /**
     * All fillable attributes survive a round-trip through the database, with
     * `outcome` cast to the enum and `completed_at` cast to Carbon.
     */
    public function test_page_crawl_fillable_fields_persist_and_casts_are_applied(): void
    {
        $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-1']);
        $completedAt = Carbon::parse('2026-05-01 10:01:05');

        $crawl = PageCrawl::create([
            'page_id' => $page->id,
            'domain' => 'example.com',
            'priority' => 5,
            'completed_at' => $completedAt,
            'outcome' => CrawlOutcomeEnum::Success,
            'status_code' => 200,
            'error_message' => null,
        ]);

        $fresh = $crawl->fresh();
        $this->assertNotNull($fresh);

        // domain / priority round-trip
        $this->assertSame('example.com', $fresh->domain);
        $this->assertSame(5, $fresh->priority);

        // outcome is cast to the enum
        $this->assertInstanceOf(CrawlOutcomeEnum::class, $fresh->outcome);
        $this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome);

        // datetime casts
        $this->assertInstanceOf(Carbon::class, $fresh->completed_at);
        $this->assertTrue($completedAt->equalTo($fresh->completed_at));

        // nullable columns
        $this->assertNull($fresh->error_message);

        // status_code persists
        $this->assertSame(200, $fresh->status_code);
    }

    /**
     * The `page` relationship resolves to the owning Page.
     */
    public function test_page_crawl_belongs_to_a_page(): void
    {
        $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-2']);

        $crawl = PageCrawl::create([
            'page_id' => $page->id,
            'domain' => 'example.com',
            'priority' => 1,
        ]);

        $related = $crawl->page;

        $this->assertInstanceOf(Page::class, $related);
        $this->assertSame($page->id, $related->id);
    }

    /**
     * Deleting a page removes all of its page_crawls rows.
     */
    public function test_deleting_a_page_cascades_to_its_page_crawls(): void
    {
        // createQuietly() skips the PageObserver so the count of explicit rows is predictable;
        // this test is about cascade delete behaviour, not observer side effects.
        $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-cascade']);

        PageCrawl::factory()->page($page)->create();
        PageCrawl::factory()->page($page)->successful()->create();
        PageCrawl::factory()->page($page)->failed('timeout during fetch')->create();

        $this->assertSame(3, PageCrawl::count());

        $page->delete();

        $this->assertSame(0, PageCrawl::count());
    }

    /**
     * Pending crawls are exactly the rows whose `outcome` is NULL.
     */
    public function test_pending_crawls_are_filtered_by_null_outcome(): void
    {
        // createQuietly() skips the PageObserver; this test counts rows with null/non-null
        // outcome — the auto-inserted observer crawl (outcome=null) would corrupt both counts.
        $page = Page::factory()->createQuietly(['url' => 'https://example.com/page-pending']);

        $pending = PageCrawl::factory()->page($page)->create();
        PageCrawl::factory()->page($page)->successful()->create();
        PageCrawl::factory()->page($page)->failed('connection refused')->create();

        $this->assertSame(1, PageCrawl::whereNull('outcome')->count());
        $this->assertSame($pending->id, PageCrawl::whereNull('outcome')->first()->id);
        $this->assertSame(2, PageCrawl::whereNotNull('outcome')->count());
    }
}

View file

@ -6,11 +6,8 @@
use App\Enums\PageStatusEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use App\Models\PageLink;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
use Tests\TestCase;
@ -19,12 +16,6 @@ class PageTest extends TestCase
{
use RefreshDatabase;
/**
 * Boots the framework test case, then swaps the queue for a fake so that
 * no test in this class pushes real jobs.
 */
protected function setUp(): void
{
    parent::setUp();

    Queue::fake();
}
public function test_page_model_fillable_fields_can_be_mass_assigned(): void
{
$page = Page::create([
@ -85,73 +76,6 @@ public function test_page_outgoing_and_incoming_links_relationships(): void
$this->assertSame($target->id, $freshSource->outgoingLinks->first()->target_page_id);
}
/**
 * `language` can be mass-assigned and is read back unchanged; a page created
 * without it reads back null.
 */
public function test_page_language_is_fillable_and_persists(): void
{
    $withLanguage = Page::create([
        'url' => 'https://example.com/crawled',
        'status' => 'discovered',
        'language' => 'en',
    ]);

    $reloaded = $withLanguage->fresh();
    self::assertNotNull($reloaded);
    self::assertSame('en', $reloaded->language);

    $withoutLanguage = Page::create([
        'url' => 'https://example.com/no-language',
        'status' => 'discovered',
    ]);

    self::assertNull($withoutLanguage->fresh()->language);
}
/**
 * `crawls` returns only the PageCrawl rows that belong to the page itself.
 */
public function test_page_has_many_crawls(): void
{
    // Both pages are created quietly so the PageObserver adds no crawl row
    // of its own; only the rows inserted below are counted.
    $page = Page::factory()->createQuietly();
    $unrelatedPage = Page::factory()->createQuietly();

    // Three crawls for the page under test ...
    for ($n = 0; $n < 3; $n++) {
        PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']);
    }
    // ... and one for another page, which must not show up in the relation.
    PageCrawl::create(['page_id' => $unrelatedPage->id, 'domain' => 'other.com']);

    $ownCrawls = $page->fresh()->crawls;

    self::assertCount(3, $ownCrawls);
    foreach ($ownCrawls as $ownCrawl) {
        self::assertInstanceOf(PageCrawl::class, $ownCrawl);
        self::assertSame($page->id, $ownCrawl->page_id);
    }
}
/**
 * `latestCrawl` resolves to the crawl row with the most recent created_at.
 */
public function test_page_latest_crawl_returns_row_with_latest_created_at(): void
{
    // Created quietly so the PageObserver adds no crawl row stamped "now",
    // which would compete with the fixed dates used below.
    $page = Page::factory()->createQuietly();

    // Inserts a crawl for the page, then backdates it to the given timestamp.
    $insertCrawlAt = static function (string $createdAt, array $extra = []) use ($page): PageCrawl {
        $attributes = ['page_id' => $page->id, 'domain' => 'example.com'] + $extra;
        $crawl = PageCrawl::create($attributes);
        $crawl->created_at = Carbon::parse($createdAt);
        $crawl->save();

        return $crawl;
    };

    $insertCrawlAt('2026-01-01 08:00:00');
    $insertCrawlAt('2026-03-15 12:00:00');
    // Only the newest row carries the sentinel message checked below.
    $insertCrawlAt('2026-05-10 18:00:00', ['error_message' => 'sentinel-latest']);

    $latestCrawl = $page->fresh()->latestCrawl;

    self::assertInstanceOf(PageCrawl::class, $latestCrawl);
    self::assertSame('sentinel-latest', $latestCrawl->error_message);
}
public function test_page_status_is_cast_to_enum(): void
{
$cases = [

View file

@ -1,23 +0,0 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Services;
use App\Services\PolitenessService;
use Tests\TestCase;
/**
 * Unit tests for PolitenessService::minDelayFor().
 */
class PolitenessServiceTest extends TestCase
{
    /**
     * Without any override in the test, the delay for a domain is 10.
     */
    public function test_min_delay_for_returns_config_default(): void
    {
        $politeness = new PolitenessService();

        $delay = $politeness->minDelayFor('example.com');

        self::assertSame(10, $delay);
    }

    /**
     * The delay follows the `crawler.min_domain_delay_seconds` config value.
     */
    public function test_min_delay_for_respects_config_override(): void
    {
        config()->set('crawler.min_domain_delay_seconds', 30);
        $politeness = new PolitenessService();

        $delay = $politeness->minDelayFor('example.com');

        self::assertSame(30, $delay);
    }
}

View file

@ -1,111 +0,0 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Services;
use App\Services\UrlService;
use PHPUnit\Framework\Attributes\DataProvider;
use Tests\TestCase;
/**
 * Unit tests for UrlService::host(): host extraction from valid URLs and
 * rejection of malformed, disallowed and IP-literal input.
 */
class UrlServiceTest extends TestCase
{
    /** Service under test, rebuilt before every test. */
    private UrlService $urls;

    protected function setUp(): void
    {
        parent::setUp();

        $this->urls = new UrlService();
    }

    // -------------------------------------------------------------------------
    // Valid input
    // -------------------------------------------------------------------------

    /**
     * A bare scheme + host URL yields the host.
     */
    public function test_extracts_host_from_simple_url(): void
    {
        $host = $this->urls->host('https://example.com');

        self::assertSame('example.com', $host);
    }

    /**
     * Path, query string and fragment do not leak into the returned host.
     */
    #[DataProvider('urlsWithNoise')]
    public function test_extracts_host_ignoring_path_query_and_fragment(string $url, string $expectedHost): void
    {
        $host = $this->urls->host($url);

        self::assertSame($expectedHost, $host);
    }

    /**
     * URLs with extra components, paired with the host expected for each.
     *
     * @return array<string, array{string, string}>
     */
    public static function urlsWithNoise(): array
    {
        return [
            'path only' => ['https://example.com/some/path', 'example.com'],
            'path and query' => ['https://example.com/page?q=hello&lang=en', 'example.com'],
            'path, query, fragment' => ['https://example.com/page?q=1#section', 'example.com'],
            'http scheme with path' => ['http://news.ycombinator.com/item?id=42', 'news.ycombinator.com'],
        ];
    }

    /**
     * An explicit port is dropped from the returned host.
     */
    public function test_strips_port_from_host(): void
    {
        $host = $this->urls->host('https://example.com:8080/path');

        self::assertSame('example.com', $host);
    }

    /**
     * The returned host is lowercase even when the input is not.
     */
    public function test_lowercases_host(): void
    {
        $host = $this->urls->host('https://EXAMPLE.COM/path');

        self::assertSame('example.com', $host);
    }

    // -------------------------------------------------------------------------
    // Invalid input
    // -------------------------------------------------------------------------

    /**
     * Every input from the provider makes host() throw InvalidArgumentException.
     */
    #[DataProvider('invalidInputs')]
    public function test_throws_on_invalid_input(string $url): void
    {
        $this->expectException(\InvalidArgumentException::class);

        $this->urls->host($url);
    }

    /**
     * Inputs host() must reject, grouped by the reason they are invalid.
     *
     * @return array<string, array{string}>
     */
    public static function invalidInputs(): array
    {
        return [
            // Malformed or structurally incomplete
            'empty string' => [''],
            'no scheme' => ['example.com/path'],
            'scheme only' => ['https://'],
            'bare string' => ['not a url at all'],

            // Schemes that are not allowed
            'javascript scheme' => ['javascript:alert(1)'],
            'ftp scheme' => ['ftp://example.com'],
            'data scheme' => ['data:text/html,<h1>hi</h1>'],

            // IP literals are not accepted as page hosts
            'ipv4 literal' => ['https://192.168.1.1/path'],
            'ipv6 literal' => ['https://[::1]/path'],
            'ipv4 without path' => ['http://10.0.0.1'],

            // Userinfo embedded in the authority
            'embedded credentials' => ['https://user:pass@example.com/'],
            'username only' => ['https://user@example.com/'],

            // IPv6 literal carrying a zone identifier
            'ipv6 with zone' => ['https://[fe80::1%25eth0]/'],

            // IPv4-mapped IPv6 literal
            'ipv4 mapped ipv6' => ['https://[::ffff:192.0.2.1]/path'],
        ];
    }
}

View file

@ -1,60 +0,0 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\ValueObjects;
use App\Enums\CrawlOutcomeEnum;
use App\ValueObjects\FetchResult;
use Illuminate\Support\Collection;
use PHPUnit\Framework\TestCase;
/**
 * Unit tests for the FetchResult value object: every constructor argument is
 * readable afterwards, and the optional fields accept null.
 */
class FetchResultTest extends TestCase
{
    /**
     * A fully populated result exposes each value exactly as it was passed in.
     */
    public function test_it_exposes_all_fields(): void
    {
        $links = collect(['https://other.com', 'https://another.com']);

        $fetchResult = new FetchResult(
            outcome: CrawlOutcomeEnum::Success,
            statusCode: 200,
            finalUrl: 'https://example.com/article',
            title: 'An Example Article',
            extractedText: 'Lorem ipsum dolor sit amet.',
            outboundLinks: $links,
            wordCount: 5,
            errorMessage: null,
        );

        self::assertSame(CrawlOutcomeEnum::Success, $fetchResult->outcome);
        self::assertSame(200, $fetchResult->statusCode);
        self::assertSame('https://example.com/article', $fetchResult->finalUrl);
        self::assertSame('An Example Article', $fetchResult->title);
        self::assertSame('Lorem ipsum dolor sit amet.', $fetchResult->extractedText);
        self::assertInstanceOf(Collection::class, $fetchResult->outboundLinks);
        self::assertSame(
            ['https://other.com', 'https://another.com'],
            $fetchResult->outboundLinks->all(),
        );
        self::assertSame(5, $fetchResult->wordCount);
        self::assertNull($fetchResult->errorMessage);
    }

    /**
     * A failure result may leave every optional field null and carry only an error message.
     */
    public function test_it_accepts_null_for_failure_outcome_fields(): void
    {
        $fetchResult = new FetchResult(
            outcome: CrawlOutcomeEnum::Failed,
            statusCode: null,
            finalUrl: null,
            title: null,
            extractedText: null,
            outboundLinks: collect(),
            wordCount: null,
            errorMessage: 'Could not connect',
        );

        self::assertSame(CrawlOutcomeEnum::Failed, $fetchResult->outcome);
        self::assertNull($fetchResult->statusCode);
        self::assertNull($fetchResult->finalUrl);
        self::assertNull($fetchResult->title);
        self::assertNull($fetchResult->extractedText);
        self::assertSame([], $fetchResult->outboundLinks->all());
        self::assertNull($fetchResult->wordCount);
        self::assertSame('Could not connect', $fetchResult->errorMessage);
    }
}