Compare commits
No commits in common. "264180cd369d12a56792168266de81787d224f92" and "6b610b699eee191b0e3bfe529a1c51db8fde6970" have entirely different histories.
264180cd36
...
6b610b699e
45 changed files with 56 additions and 3010 deletions
|
|
@ -61,5 +61,3 @@ AWS_BUCKET=
|
||||||
AWS_USE_PATH_STYLE_ENDPOINT=false
|
AWS_USE_PATH_STYLE_ENDPOINT=false
|
||||||
|
|
||||||
VITE_APP_NAME="${APP_NAME}"
|
VITE_APP_NAME="${APP_NAME}"
|
||||||
|
|
||||||
CRAWLER_MIN_DOMAIN_DELAY_SECONDS=10
|
|
||||||
|
|
|
||||||
17
README.md
17
README.md
|
|
@ -36,7 +36,6 @@ ### Required environment
|
||||||
### Services you need to provide
|
### Services you need to provide
|
||||||
|
|
||||||
- **App**: pull `forge.lvl0.xyz/lvl0/trove:latest` (or a pinned `v*` tag). Exposes port `8000` inside the container. The image runs migrations and warms caches on boot.
|
- **App**: pull `forge.lvl0.xyz/lvl0/trove:latest` (or a pinned `v*` tag). Exposes port `8000` inside the container. The image runs migrations and warms caches on boot.
|
||||||
- **Worker**: same image as `app`, with `command: php artisan queue:work --tries=3 --max-time=3600`. Processes the crawler queue (URL fetching, content extraction, retries). Crawls won't actually run without this — `app` only enqueues work. **Required for the crawler to function.**
|
|
||||||
- **PostgreSQL 17**. Hostname must be reachable as `db` (default) or set `DB_HOST`. Persist `/var/lib/postgresql/data`.
|
- **PostgreSQL 17**. Hostname must be reachable as `db` (default) or set `DB_HOST`. Persist `/var/lib/postgresql/data`.
|
||||||
- **Redis 7** with `--appendonly yes` (queue jobs persist across restarts). Hostname `redis` or set `REDIS_HOST`.
|
- **Redis 7** with `--appendonly yes` (queue jobs persist across restarts). Hostname `redis` or set `REDIS_HOST`.
|
||||||
|
|
||||||
|
|
@ -72,22 +71,6 @@ ### Example compose stack
|
||||||
db: { condition: service_healthy }
|
db: { condition: service_healthy }
|
||||||
redis: { condition: service_healthy }
|
redis: { condition: service_healthy }
|
||||||
|
|
||||||
worker:
|
|
||||||
image: forge.lvl0.xyz/lvl0/trove:latest
|
|
||||||
restart: always
|
|
||||||
command: php artisan queue:work --tries=3 --max-time=3600
|
|
||||||
environment:
|
|
||||||
APP_KEY: "${APP_KEY}"
|
|
||||||
APP_URL: "${APP_URL}"
|
|
||||||
DB_DATABASE: "${DB_DATABASE}"
|
|
||||||
DB_USERNAME: "${DB_USERNAME}"
|
|
||||||
DB_PASSWORD: "${DB_PASSWORD}"
|
|
||||||
volumes:
|
|
||||||
- app_storage:/app/storage
|
|
||||||
depends_on:
|
|
||||||
db: { condition: service_healthy }
|
|
||||||
redis: { condition: service_healthy }
|
|
||||||
|
|
||||||
db:
|
db:
|
||||||
image: postgres:17-alpine
|
image: postgres:17-alpine
|
||||||
restart: always
|
restart: always
|
||||||
|
|
|
||||||
|
|
@ -1,162 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace App\Actions;
|
|
||||||
|
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
|
||||||
use App\Services\UrlService;
|
|
||||||
use App\ValueObjects\FetchResult;
|
|
||||||
use fivefilters\Readability\Configuration;
|
|
||||||
use fivefilters\Readability\Readability;
|
|
||||||
use GuzzleHttp\Exception\ConnectException;
|
|
||||||
use Illuminate\Http\Client\ConnectionException;
|
|
||||||
use Illuminate\Http\Client\Factory;
|
|
||||||
use Illuminate\Http\Client\Response;
|
|
||||||
use InvalidArgumentException;
|
|
||||||
use League\Uri\BaseUri;
|
|
||||||
use Symfony\Component\DomCrawler\Crawler;
|
|
||||||
use Throwable;
|
|
||||||
|
|
||||||
/**
 * Fetches a single URL over HTTP and turns the response into a FetchResult.
 *
 * Connection-level failures are converted into a Failed/Timeout result instead of
 * being rethrown. Title, readable text, outbound links and word count are only
 * extracted when the response validates as a Success outcome.
 */
class FetchPageAction
{
    public function __construct(
        private Factory $http,
        private UrlService $urlService,
    ) {}

    /**
     * Fetch the given URL and classify the response.
     *
     * @param  string  $url  Absolute URL to request.
     * @return FetchResult Outcome plus extracted content (content fields are null
     *                     and the link collection is empty for non-Success outcomes).
     */
    public function __invoke(string $url): FetchResult
    {
        try {
            $response = $this->http
                ->timeout(config('crawler.timeout'))
                ->withHeaders([
                    'User-Agent' => config('crawler.user_agent'),
                    'Accept' => 'text/html',
                ])
                ->withOptions([
                    'allow_redirects' => ['max' => config('crawler.max_redirects')],
                ])
                ->get($url);
        } catch (ConnectionException|ConnectException $e) {
            return $this->failureResult($e);
        }

        [$outcome, $error] = $this->validateResponse($response);

        // Explicit defaults for every non-Success outcome.
        $title = null;
        $extractedText = null;
        $links = collect();
        $wordCount = null;

        if ($outcome === CrawlOutcomeEnum::Success) {
            [$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url);
            $wordCount = $this->countWords($extractedText);
        }

        return new FetchResult(
            outcome: $outcome,
            statusCode: $response->status(),
            // The request URL is reported here; redirects are not tracked.
            finalUrl: $url,
            title: $title,
            extractedText: $extractedText,
            outboundLinks: $links,
            wordCount: $wordCount,
            errorMessage: $error,
        );
    }

    /**
     * Map the HTTP status and Content-Type onto a crawl outcome.
     *
     * @return array{0: CrawlOutcomeEnum, 1: ?string} Outcome and an optional error message.
     */
    private function validateResponse(Response $response): array
    {
        $status = $response->status();

        if ($status >= 400 && $status < 500) {
            return [CrawlOutcomeEnum::Blocked4xx, "HTTP {$status}"];
        }

        if ($status >= 500) {
            return [CrawlOutcomeEnum::Blocked5xx, "HTTP {$status}"];
        }

        // Anything that does not declare text/html is rejected rather than parsed.
        $contentType = $response->header('Content-Type');
        if (! str_starts_with(mb_strtolower($contentType), 'text/html')) {
            return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"];
        }

        return [CrawlOutcomeEnum::Success, null];
    }

    /**
     * Build the result for a connection-level failure.
     *
     * The outcome is Timeout when the underlying Guzzle exception reports the cURL
     * "operation timed out" errno, and Failed otherwise.
     */
    private function failureResult(ConnectionException|ConnectException $e): FetchResult
    {
        // The Guzzle exception is either $e itself or the cause wrapped by Laravel's exception.
        $guzzleException = $e instanceof ConnectException
            ? $e
            : ($e->getPrevious() instanceof ConnectException
                ? $e->getPrevious()
                : null);

        $errno = $guzzleException?->getHandlerContext()['errno'] ?? null;

        $outcome = $errno === CURLE_OPERATION_TIMEDOUT
            ? CrawlOutcomeEnum::Timeout
            : CrawlOutcomeEnum::Failed;

        return new FetchResult(
            outcome: $outcome,
            statusCode: null,
            finalUrl: null,
            title: null,
            extractedText: null,
            outboundLinks: collect(),
            wordCount: null,
            errorMessage: $e->getMessage(),
        );
    }

    /**
     * Extract the <title>, the readable text and the outbound links of an HTML body.
     *
     * Links are taken from the readable main content only, resolved against $url,
     * validated, de-duplicated and re-indexed.
     *
     * @return array{0: ?string, 1: string, 2: \Illuminate\Support\Collection<int, string>}
     */
    private function extractTitleTextAndLinks(string $body, string $url): array
    {
        $crawler = new Crawler($body);

        $title = $crawler->filter('title')->count() > 0
            ? trim($crawler->filter('title')->text())
            : null;

        $mainContent = $this->extractMainContent($body);
        $extractedText = trim(strip_tags($mainContent));

        $links = collect();
        if ($mainContent !== '') {
            $linkCrawler = new Crawler($mainContent);
            if ($linkCrawler->filter('a[href]')->count() > 0) {
                $links = collect($linkCrawler->filter('a[href]')->extract(['href']));
            }
        }

        $linksResolved = $links
            ->map(fn (string $href) => $this->resolveAndValidateLink($href, $url))
            ->filter()
            ->unique()
            ->values();

        return [$title, $extractedText, $linksResolved];
    }

    /**
     * Run Readability over the body and return the main-content HTML.
     *
     * Readability throws a ParseException when it cannot find an article in the
     * document. That is an expected condition for many pages, so it is treated as
     * "no readable content" instead of crashing the fetch.
     */
    private function extractMainContent(string $body): string
    {
        $readability = new Readability(new Configuration);

        try {
            $readability->parse($body);
        } catch (\fivefilters\Readability\ParseException) {
            return '';
        }

        return $readability->getContent() ?? '';
    }

    /**
     * Count whitespace-separated words in the extracted text.
     *
     * preg_split() returns false when the /u pattern meets malformed UTF-8; in that
     * case the text is split byte-wise instead of passing false to count().
     */
    private function countWords(string $text): int
    {
        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        if ($words === false) {
            $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        }

        return count($words);
    }

    /**
     * Resolve an href against the page URL and return it only if it is crawlable.
     *
     * Returns null when the href cannot be resolved, points back at the page itself
     * (after stripping the fragment), or is rejected by UrlService::host().
     */
    private function resolveAndValidateLink(string $href, string $finalUrl): ?string
    {
        try {
            $resolved = (string) BaseUri::from($finalUrl)->resolve($href);
            // Drop the fragment so "#section" variants collapse onto one URL.
            $resolved = strstr($resolved, '#', true) ?: $resolved;
        } catch (Throwable) {
            return null;
        }

        if ($resolved === $finalUrl) {
            return null;
        }

        try {
            // Called for validation only; the returned host is not needed here.
            $this->urlService->host($resolved);
        } catch (InvalidArgumentException) {
            return null;
        }

        return $resolved;
    }
}
|
|
||||||
|
|
@ -1,22 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace App\Actions;
|
|
||||||
|
|
||||||
use App\Enums\PageStatusEnum;
|
|
||||||
use App\Models\Page;
|
|
||||||
|
|
||||||
/**
 * Registers a URL as a Page, reusing the existing row when the URL is already known.
 */
class RegisterDiscoveredPageAction
{
    /**
     * Look the page up by URL and create it in the Discovered state when missing.
     *
     * @param  string  $url  URL that identifies the page.
     * @param  ?int  $instanceId  Instance id stored on a newly created page; not used for the lookup.
     */
    public function __invoke(string $url, ?int $instanceId = null): Page
    {
        $lookup = ['url' => $url];

        // Only applied when no row matches $lookup.
        $attributesForNewRow = [
            'status' => PageStatusEnum::Discovered,
            'instance_id' => $instanceId,
        ];

        return Page::firstOrCreate($lookup, $attributesForNewRow);
    }
}
|
|
||||||
|
|
@ -1,60 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace App\Enums;
|
|
||||||
|
|
||||||
/**
 * Result classification of a single crawl attempt.
 */
enum CrawlOutcomeEnum: string
{
    case Success = 'success';
    case Failed = 'failed';
    case Timeout = 'timeout';
    case BlockedRobots = 'blocked_robots';
    case Blocked4xx = 'blocked_4xx';
    case Blocked5xx = 'blocked_5xx';

    /**
     * The HTTP fetch itself succeeded (2xx), but the response cannot be indexed
     * in v0.1 because its Content-Type is not HTML. The worker MUST also set
     * `pages.status = Rejected` for this outcome and must NOT treat it as Failed.
     * The page row is kept in the DB so the URL is not re-discovered in a loop
     * as the fediverse re-shares it.
     */
    case Rejected = 'rejected';

    /**
     * The PageStatusEnum value the parent `pages` row should end up with for this outcome.
     */
    public function toPageStatus(): PageStatusEnum
    {
        return match ($this) {
            self::Failed,
            self::Timeout,
            self::BlockedRobots,
            self::Blocked4xx,
            self::Blocked5xx => PageStatusEnum::Failed,
            self::Rejected => PageStatusEnum::Rejected,
            self::Success => PageStatusEnum::Fetched,
        };
    }

    /**
     * Whether the worker should retry this outcome. Only transient failures
     * (Failed, Timeout, Blocked5xx) retry; successes and permanent failures
     * (4xx, robots block, rejected content type) do not.
     */
    public function isRetryable(): bool
    {
        return in_array($this, [self::Failed, self::Timeout, self::Blocked5xx], true);
    }

    /**
     * Whether the worker should register the outbound links found during the fetch.
     * Only Success carries meaningful links; every other outcome either failed or
     * returned no usable HTML.
     */
    public function shouldRegisterOutboundLinks(): bool
    {
        return match ($this) {
            self::Success => true,
            default => false,
        };
    }
}
|
|
||||||
|
|
@ -9,12 +9,4 @@ enum PageStatusEnum: string
|
||||||
case Discovered = 'discovered';
|
case Discovered = 'discovered';
|
||||||
case Fetched = 'fetched';
|
case Fetched = 'fetched';
|
||||||
case Failed = 'failed';
|
case Failed = 'failed';
|
||||||
|
|
||||||
/**
|
|
||||||
* The crawler fetched the page but rejected it as unindexable in v0.1
|
|
||||||
* (non-HTML Content-Type). Page row stays as a sentinel preventing
|
|
||||||
* re-discovery loops; future re-crawl could flip status back to
|
|
||||||
* Discovered → Fetched if the URL starts serving HTML.
|
|
||||||
*/
|
|
||||||
case Rejected = 'rejected';
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,105 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace App\Jobs;
|
|
||||||
|
|
||||||
use App\Actions\FetchPageAction;
|
|
||||||
use App\Actions\RegisterDiscoveredPageAction;
|
|
||||||
use App\Enums\PageStatusEnum;
|
|
||||||
use App\Models\PageCrawl;
|
|
||||||
use App\Services\PolitenessService;
|
|
||||||
use App\ValueObjects\FetchResult;
|
|
||||||
use Illuminate\Contracts\Queue\ShouldQueue;
|
|
||||||
use Illuminate\Foundation\Queue\Queueable;
|
|
||||||
use Illuminate\Support\Facades\Cache;
|
|
||||||
|
|
||||||
/**
 * Queue job that performs one crawl attempt for a PageCrawl row.
 *
 * It takes a per-domain cache lock, fetches the page, records the outcome on both
 * the PageCrawl row and its Page, registers outbound links for successful fetches
 * and schedules a follow-up attempt for retryable outcomes.
 */
class ProcessCrawlJob implements ShouldQueue
{
    use Queueable;

    public function __construct(
        public PageCrawl $pageCrawl,
    ) {}

    /**
     * Run the crawl attempt, or put the job back on the queue when the domain lock is held.
     */
    public function handle(): void
    {
        $fetcher = resolve(FetchPageAction::class);
        $register = resolve(RegisterDiscoveredPageAction::class);
        $politenessService = resolve(PolitenessService::class);

        $delay = $politenessService->minDelayFor($this->pageCrawl->domain);

        // The lock is never released here; it is created with a TTL of $delay seconds,
        // so other jobs for the same domain are pushed back until it expires.
        $lock = Cache::lock("crawler:domain:{$this->pageCrawl->domain}", $delay);

        if (! $lock->get()) {
            // NOTE(review): each release presumably counts as an attempt against the
            // worker's --tries limit — confirm that contention cannot exhaust it.
            $this->release($delay);

            return;
        }

        $result = $fetcher($this->pageCrawl->page->url);

        $this->writeOutcome($result);
        $this->updatePageStatus($result);

        if ($result->outcome->shouldRegisterOutboundLinks()) {
            $result->outboundLinks->each(fn (string $url) => $register($url));
        }

        if ($result->outcome->isRetryable()) {
            $this->scheduleRetryIfNeeded();
        }
    }

    /**
     * Persist the fetch outcome on this attempt's PageCrawl row.
     */
    private function writeOutcome(FetchResult $result): void
    {
        $this->pageCrawl->update([
            'outcome' => $result->outcome,
            'completed_at' => now(),
            'status_code' => $result->statusCode,
            'error_message' => $result->errorMessage,
        ]);
    }

    /**
     * Move the parent Page to the status that corresponds to the outcome.
     *
     * Fetched also stores the fetch time and title, Failed stores the failure time;
     * the other statuses only update the status column.
     */
    private function updatePageStatus(FetchResult $result): void
    {
        $status = $result->outcome->toPageStatus();

        $update = match ($status) {
            PageStatusEnum::Fetched => [
                'status' => $status,
                'fetched_at' => now(),
                'title' => $result->title,
            ],
            PageStatusEnum::Failed => [
                'status' => $status,
                'failed_at' => now(),
            ],
            PageStatusEnum::Rejected => [
                'status' => $status,
            ],
            PageStatusEnum::Discovered => [
                'status' => $status,
            ],
        };

        $this->pageCrawl->page->update($update);
    }

    /**
     * Create a fresh PageCrawl row and dispatch it one hour from now, unless the
     * page already has three or more crawl rows.
     */
    private function scheduleRetryIfNeeded(): void
    {
        if (PageCrawl::where('page_id', $this->pageCrawl->page_id)->count() >= 3) {
            return;
        }

        // Copy only the identifying columns. Cloning toArray() would carry over the
        // completed_at / status_code / error_message just written by writeOutcome(),
        // so the pending retry row would look like a finished attempt, and it would
        // mass-assign non-fillable keys (id, timestamps, the loaded page relation).
        $retryAttributes = $this->pageCrawl->only(['page_id', 'domain', 'priority']);

        // Events are suppressed so the job is not dispatched by a model observer as
        // well; it is dispatched below with a delay.
        $newRow = PageCrawl::withoutEvents(
            fn () => PageCrawl::create($retryAttributes)
        );

        ProcessCrawlJob::dispatch($newRow)->delay(now()->addHour());
    }
}
|
|
||||||
|
|
@ -4,7 +4,8 @@
|
||||||
|
|
||||||
namespace App\Listeners;
|
namespace App\Listeners;
|
||||||
|
|
||||||
use App\Actions\RegisterDiscoveredPageAction;
|
use App\Enums\PageStatusEnum;
|
||||||
|
use App\Models\Page;
|
||||||
use App\Models\PageLink;
|
use App\Models\PageLink;
|
||||||
use Illuminate\Contracts\Queue\ShouldQueue;
|
use Illuminate\Contracts\Queue\ShouldQueue;
|
||||||
use Illuminate\Support\Facades\DB;
|
use Illuminate\Support\Facades\DB;
|
||||||
|
|
@ -12,20 +13,22 @@
|
||||||
|
|
||||||
class UrlDiscoveredListener implements ShouldQueue
|
class UrlDiscoveredListener implements ShouldQueue
|
||||||
{
|
{
|
||||||
public function __construct(
|
|
||||||
private RegisterDiscoveredPageAction $registerPage,
|
|
||||||
) {}
|
|
||||||
|
|
||||||
public function handle(UrlDiscovered $event): void
|
public function handle(UrlDiscovered $event): void
|
||||||
{
|
{
|
||||||
DB::transaction(function () use ($event) {
|
DB::transaction(function () use ($event) {
|
||||||
$targetPage = ($this->registerPage)($event->url, $event->instanceId);
|
$targetPage = Page::firstOrCreate(
|
||||||
|
['url' => $event->url],
|
||||||
|
['status' => PageStatusEnum::Discovered, 'instance_id' => $event->instanceId],
|
||||||
|
);
|
||||||
|
|
||||||
if ($event->postUrl === null || $event->postUrl === $event->url) {
|
if ($event->postUrl === null || $event->postUrl === $event->url) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
$sourcePage = ($this->registerPage)($event->postUrl, $event->instanceId);
|
$sourcePage = Page::firstOrCreate(
|
||||||
|
['url' => $event->postUrl],
|
||||||
|
['status' => PageStatusEnum::Discovered, 'instance_id' => $event->instanceId],
|
||||||
|
);
|
||||||
|
|
||||||
PageLink::firstOrCreate([
|
PageLink::firstOrCreate([
|
||||||
'source_page_id' => $sourcePage->id,
|
'source_page_id' => $sourcePage->id,
|
||||||
|
|
|
||||||
|
|
@ -1,44 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace App\Livewire;
|
|
||||||
|
|
||||||
use App\Actions\RegisterDiscoveredPageAction;
|
|
||||||
use Illuminate\Contracts\View\View;
|
|
||||||
use Illuminate\Support\Facades\RateLimiter;
|
|
||||||
use Livewire\Component;
|
|
||||||
|
|
||||||
/**
 * Livewire form that lets a visitor submit a URL to be registered as a page.
 */
class UrlSubmissionForm extends Component
{
    public string $url = '';

    public ?string $confirmedUrl = null;

    /**
     * Rate-limit, validate and register the submitted URL.
     *
     * Submissions are limited to 10 per 60 seconds per client IP; the hit is
     * recorded before validation, so invalid submissions count as well.
     */
    public function submit(RegisterDiscoveredPageAction $registerPage): void
    {
        $key = 'submit-url:' . request()->ip();

        if (RateLimiter::tooManyAttempts($key, 10)) {
            $this->addError('rate_limit', 'Too many submissions, try again shortly.');

            return;
        }

        RateLimiter::hit($key, 60);

        $validated = $this->validate([
            'url' => [
                'required',
                'url:http,https',
                // The generic URL rule accepts hosts UrlService::host() rejects (IP
                // literals, embedded credentials). Registering such a URL would fail
                // only after the page row is written, when the crawl row is derived
                // from the host, so reject it here as a normal validation error.
                static function (string $attribute, mixed $value, \Closure $fail): void {
                    try {
                        app(\App\Services\UrlService::class)->host((string) $value);
                    } catch (\InvalidArgumentException) {
                        $fail('This URL cannot be crawled.');
                    }
                },
            ],
        ]);

        $registerPage($validated['url']);

        $this->confirmedUrl = $validated['url'];
        $this->reset('url');
    }

    public function render(): View
    {
        return view('livewire.url-submission-form');
    }
}
|
|
||||||
|
|
@ -5,17 +5,13 @@
|
||||||
namespace App\Models;
|
namespace App\Models;
|
||||||
|
|
||||||
use App\Enums\PageStatusEnum;
|
use App\Enums\PageStatusEnum;
|
||||||
use App\Observers\PageObserver;
|
|
||||||
use Database\Factories\PageFactory;
|
use Database\Factories\PageFactory;
|
||||||
use Illuminate\Database\Eloquent\Attributes\ObservedBy;
|
|
||||||
use Illuminate\Database\Eloquent\Factories\HasFactory;
|
use Illuminate\Database\Eloquent\Factories\HasFactory;
|
||||||
use Illuminate\Database\Eloquent\Model;
|
use Illuminate\Database\Eloquent\Model;
|
||||||
use Illuminate\Database\Eloquent\Relations\BelongsTo;
|
use Illuminate\Database\Eloquent\Relations\BelongsTo;
|
||||||
use Illuminate\Database\Eloquent\Relations\HasMany;
|
use Illuminate\Database\Eloquent\Relations\HasMany;
|
||||||
use Illuminate\Database\Eloquent\Relations\HasOne;
|
|
||||||
use Lvl0\FediDiscover\Models\Instance;
|
use Lvl0\FediDiscover\Models\Instance;
|
||||||
|
|
||||||
#[ObservedBy([PageObserver::class])]
|
|
||||||
class Page extends Model
|
class Page extends Model
|
||||||
{
|
{
|
||||||
/** @use HasFactory<PageFactory> */
|
/** @use HasFactory<PageFactory> */
|
||||||
|
|
@ -24,7 +20,6 @@ class Page extends Model
|
||||||
protected $fillable = [
|
protected $fillable = [
|
||||||
'url',
|
'url',
|
||||||
'status',
|
'status',
|
||||||
'language',
|
|
||||||
'title',
|
'title',
|
||||||
'instance_id',
|
'instance_id',
|
||||||
'posted_at',
|
'posted_at',
|
||||||
|
|
@ -53,14 +48,4 @@ public function incomingLinks(): HasMany
|
||||||
{
|
{
|
||||||
return $this->hasMany(PageLink::class, 'target_page_id');
|
return $this->hasMany(PageLink::class, 'target_page_id');
|
||||||
}
|
}
|
||||||
|
|
||||||
public function crawls(): HasMany
|
|
||||||
{
|
|
||||||
return $this->hasMany(PageCrawl::class);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function latestCrawl(): HasOne
|
|
||||||
{
|
|
||||||
return $this->hasOne(PageCrawl::class)->latestOfMany('created_at');
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,45 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace App\Models;
|
|
||||||
|
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
|
||||||
use App\Observers\PageCrawlObserver;
|
|
||||||
use Database\Factories\PageCrawlFactory;
|
|
||||||
use Illuminate\Database\Eloquent\Attributes\ObservedBy;
|
|
||||||
use Illuminate\Database\Eloquent\Factories\HasFactory;
|
|
||||||
use Illuminate\Database\Eloquent\Model;
|
|
||||||
use Illuminate\Database\Eloquent\Relations\BelongsTo;
|
|
||||||
|
|
||||||
/**
 * One crawl attempt for a Page. PageCrawlObserver is attached to this model.
 */
#[ObservedBy([PageCrawlObserver::class])]
class PageCrawl extends Model
{
    /** @use HasFactory<PageCrawlFactory> */
    use HasFactory;

    /**
     * Columns that may be mass-assigned.
     */
    protected $fillable = [
        'page_id',
        'domain',
        'priority',
        'completed_at',
        'outcome',
        'status_code',
        'error_message',
    ];

    /**
     * Attribute casts; `outcome` is hydrated as a CrawlOutcomeEnum.
     */
    protected $casts = [
        'priority' => 'integer',
        'completed_at' => 'datetime',
        'outcome' => CrawlOutcomeEnum::class,
        'status_code' => 'integer',
    ];

    /**
     * The page this crawl attempt belongs to.
     *
     * @return BelongsTo<Page, $this>
     */
    public function page(): BelongsTo
    {
        return $this->belongsTo(Page::class);
    }
}
|
|
||||||
|
|
@ -1,14 +0,0 @@
|
||||||
<?php

declare(strict_types=1);

namespace App\Observers;

use App\Jobs\ProcessCrawlJob;
use App\Models\PageCrawl;

/**
 * Dispatches the crawl job whenever a PageCrawl row is created.
 */
class PageCrawlObserver
{
    /**
     * Queue a ProcessCrawlJob for the newly created crawl row.
     */
    public function created(PageCrawl $pageCrawl): void
    {
        ProcessCrawlJob::dispatch($pageCrawl);
    }
}
|
|
||||||
|
|
@ -1,25 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace App\Observers;
|
|
||||||
|
|
||||||
use App\Models\Page;
|
|
||||||
use App\Models\PageCrawl;
|
|
||||||
use App\Services\UrlService;
|
|
||||||
|
|
||||||
/**
 * Ensures every newly created Page has a PageCrawl row.
 */
class PageObserver
{
    public function __construct(private UrlService $urlService) {}

    /**
     * Create the crawl row for a new page unless one already exists for it.
     *
     * The domain comes from UrlService::host(), which throws an
     * InvalidArgumentException for URLs it does not accept.
     */
    public function created(Page $page): void
    {
        $domain = $this->urlService->host($page->url);

        $lookup = ['page_id' => $page->id];
        $attributesForNewRow = [
            'domain' => $domain,
            'priority' => 0,
        ];

        PageCrawl::firstOrCreate($lookup, $attributesForNewRow);
    }
}
|
|
||||||
|
|
@ -1,19 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace App\Services;
|
|
||||||
|
|
||||||
/**
 * Provides the minimum delay between requests to the same domain.
 */
class PolitenessService
{
    /** Delay used when no usable value is configured. */
    private const DEFAULT_MIN_DELAY_SECONDS = 10;

    /**
     * Minimum number of seconds between two requests to $domain.
     *
     * Reads `crawler.min_domain_delay_seconds` from config. The value is cast to
     * int because config values that originate from the environment may be strings,
     * and this file's strict types would turn returning a string into a TypeError.
     * A missing or non-numeric value (for example an empty string) falls back to
     * the default.
     *
     * @param  string  $domain  Currently unused; the delay is the same for every domain.
     */
    public function minDelayFor(string $domain): int
    {
        $configValue = config('crawler.min_domain_delay_seconds');

        if (is_numeric($configValue)) {
            return (int) $configValue;
        }

        return self::DEFAULT_MIN_DELAY_SECONDS;
    }
}
|
|
||||||
|
|
@ -1,40 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace App\Services;
|
|
||||||
|
|
||||||
use Illuminate\Support\Uri;
|
|
||||||
use InvalidArgumentException;
|
|
||||||
|
|
||||||
/**
 * URL validation helpers for the crawler.
 */
class UrlService
{
    /**
     * Validate a URL and return its lower-cased host.
     *
     * The checks run in this order: a scheme is present, the scheme is http or
     * https, no user info is embedded, a host is present, and the host is not an
     * IP literal.
     *
     * @throws InvalidArgumentException When any of the checks fails.
     */
    public function host(string $url): string
    {
        $uri = Uri::of($url);

        $scheme = $uri->scheme();
        $hasScheme = $scheme !== null && $scheme !== '';
        if (! $hasScheme) {
            throw new InvalidArgumentException("URL has no scheme: {$url}");
        }

        if ($scheme !== 'http' && $scheme !== 'https') {
            throw new InvalidArgumentException("Invalid URL scheme: {$scheme}");
        }

        if ($uri->user() !== null) {
            throw new InvalidArgumentException("URLs with embedded credentials not allowed: {$url}");
        }

        $host = $uri->host();
        $hasHost = $host !== null && $host !== '';
        if (! $hasHost) {
            throw new InvalidArgumentException("URL has no host: {$url}");
        }

        // Strip surrounding brackets and anything from the first "%" onwards
        // before testing whether the host is an IP address.
        $unbracketed = trim($host, '[]');
        $bareHost = preg_replace('/%.*$/', '', $unbracketed);

        $isIpLiteral = filter_var($bareHost, FILTER_VALIDATE_IP) !== false;
        if ($isIpLiteral) {
            throw new InvalidArgumentException("IP literal hosts not allowed: {$host}");
        }

        return mb_strtolower($host);
    }
}
|
|
||||||
|
|
@ -1,26 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace App\ValueObjects;
|
|
||||||
|
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
|
||||||
use Illuminate\Support\Collection;
|
|
||||||
|
|
||||||
/**
 * Immutable result of one page fetch.
 */
final readonly class FetchResult
{
    /**
     * @param  CrawlOutcomeEnum  $outcome  Classification of the fetch.
     * @param  ?int  $statusCode  HTTP status code, or null when there is none.
     * @param  ?string  $finalUrl  Equals the request URL in v0.1. Tracking the real
     *                             post-redirect URL is deferred (see the ticket #12 spec),
     *                             so consumers MUST NOT rely on this being the post-redirect
     *                             location until that lands.
     * @param  ?string  $title  Page title, when one was extracted.
     * @param  ?string  $extractedText  Extracted text, when available.
     * @param  Collection<int, string>  $outboundLinks
     * @param  ?int  $wordCount  Word count of the extracted text, when available.
     * @param  ?string  $errorMessage  Error description for non-successful outcomes.
     */
    public function __construct(
        public CrawlOutcomeEnum $outcome,
        public ?int $statusCode,
        public ?string $finalUrl,
        public ?string $title,
        public ?string $extractedText,
        public Collection $outboundLinks,
        public ?int $wordCount,
        public ?string $errorMessage,
    ) {}
}
|
|
||||||
|
|
@ -3,7 +3,6 @@
|
||||||
use Illuminate\Foundation\Application;
|
use Illuminate\Foundation\Application;
|
||||||
use Illuminate\Foundation\Configuration\Exceptions;
|
use Illuminate\Foundation\Configuration\Exceptions;
|
||||||
use Illuminate\Foundation\Configuration\Middleware;
|
use Illuminate\Foundation\Configuration\Middleware;
|
||||||
use Illuminate\Http\Request;
|
|
||||||
|
|
||||||
return Application::configure(basePath: dirname(__DIR__))
|
return Application::configure(basePath: dirname(__DIR__))
|
||||||
->withRouting(
|
->withRouting(
|
||||||
|
|
@ -12,11 +11,7 @@
|
||||||
health: '/up',
|
health: '/up',
|
||||||
)
|
)
|
||||||
->withMiddleware(function (Middleware $middleware): void {
|
->withMiddleware(function (Middleware $middleware): void {
|
||||||
$middleware->trustProxies(
|
//
|
||||||
at: '*',
|
|
||||||
headers: Request::HEADER_X_FORWARDED_FOR
|
|
||||||
| Request::HEADER_X_FORWARDED_PROTO,
|
|
||||||
);
|
|
||||||
})
|
})
|
||||||
->withExceptions(function (Exceptions $exceptions): void {
|
->withExceptions(function (Exceptions $exceptions): void {
|
||||||
//
|
//
|
||||||
|
|
|
||||||
|
|
@ -16,12 +16,10 @@
|
||||||
],
|
],
|
||||||
"require": {
|
"require": {
|
||||||
"php": "^8.3",
|
"php": "^8.3",
|
||||||
"fivefilters/readability.php": "^3.3",
|
|
||||||
"laravel/framework": "^13.0",
|
"laravel/framework": "^13.0",
|
||||||
"laravel/tinker": "^3.0",
|
"laravel/tinker": "^3.0",
|
||||||
"livewire/livewire": "^4.2",
|
"livewire/livewire": "^4.2",
|
||||||
"lvl0/fedi-discover": "@dev",
|
"lvl0/fedi-discover": "@dev"
|
||||||
"symfony/dom-crawler": "^7.4"
|
|
||||||
},
|
},
|
||||||
"require-dev": {
|
"require-dev": {
|
||||||
"fakerphp/faker": "^1.23",
|
"fakerphp/faker": "^1.23",
|
||||||
|
|
|
||||||
276
composer.lock
generated
276
composer.lock
generated
|
|
@ -4,7 +4,7 @@
|
||||||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||||
"This file is @generated automatically"
|
"This file is @generated automatically"
|
||||||
],
|
],
|
||||||
"content-hash": "2c63ed546b17b144997244f805e8a94a",
|
"content-hash": "e46e58784ec34415557c78db6bb6c97e",
|
||||||
"packages": [
|
"packages": [
|
||||||
{
|
{
|
||||||
"name": "brick/math",
|
"name": "brick/math",
|
||||||
|
|
@ -508,71 +508,6 @@
|
||||||
],
|
],
|
||||||
"time": "2025-03-06T22:45:56+00:00"
|
"time": "2025-03-06T22:45:56+00:00"
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"name": "fivefilters/readability.php",
|
|
||||||
"version": "v3.3.3",
|
|
||||||
"source": {
|
|
||||||
"type": "git",
|
|
||||||
"url": "https://github.com/fivefilters/readability.php.git",
|
|
||||||
"reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8"
|
|
||||||
},
|
|
||||||
"dist": {
|
|
||||||
"type": "zip",
|
|
||||||
"url": "https://api.github.com/repos/fivefilters/readability.php/zipball/e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8",
|
|
||||||
"reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8",
|
|
||||||
"shasum": ""
|
|
||||||
},
|
|
||||||
"require": {
|
|
||||||
"ext-dom": "*",
|
|
||||||
"ext-mbstring": "*",
|
|
||||||
"ext-xml": "*",
|
|
||||||
"league/uri": "^7.0",
|
|
||||||
"masterminds/html5": "^2.0",
|
|
||||||
"php": ">=8.1",
|
|
||||||
"psr/log": "^1.0 || ^2.0 || ^3.0"
|
|
||||||
},
|
|
||||||
"require-dev": {
|
|
||||||
"monolog/monolog": "^3.0",
|
|
||||||
"phpunit/phpunit": "^10.0 || ^11.0"
|
|
||||||
},
|
|
||||||
"suggest": {
|
|
||||||
"monolog/monolog": "Allow logging debug information"
|
|
||||||
},
|
|
||||||
"type": "library",
|
|
||||||
"autoload": {
|
|
||||||
"psr-4": {
|
|
||||||
"fivefilters\\Readability\\": "src/"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"notification-url": "https://packagist.org/downloads/",
|
|
||||||
"license": [
|
|
||||||
"Apache-2.0"
|
|
||||||
],
|
|
||||||
"authors": [
|
|
||||||
{
|
|
||||||
"name": "Andres Rey",
|
|
||||||
"email": "andreskrey@gmail.com",
|
|
||||||
"role": "Original Developer"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Keyvan Minoukadeh",
|
|
||||||
"email": "keyvan@fivefilters.org",
|
|
||||||
"homepage": "https://www.fivefilters.org",
|
|
||||||
"role": "Developer/Maintainer"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"description": "A PHP port of Readability.js",
|
|
||||||
"homepage": "https://github.com/fivefilters/readability.php",
|
|
||||||
"keywords": [
|
|
||||||
"html",
|
|
||||||
"readability"
|
|
||||||
],
|
|
||||||
"support": {
|
|
||||||
"issues": "https://github.com/fivefilters/readability.php/issues",
|
|
||||||
"source": "https://github.com/fivefilters/readability.php/tree/v3.3.3"
|
|
||||||
},
|
|
||||||
"time": "2025-04-26T23:45:37+00:00"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"name": "fruitcake/php-cors",
|
"name": "fruitcake/php-cors",
|
||||||
"version": "v1.4.0",
|
"version": "v1.4.0",
|
||||||
|
|
@ -2167,7 +2102,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "lvl0/fedi-discover",
|
"name": "lvl0/fedi-discover",
|
||||||
"version": "dev-release/0.1.0",
|
"version": "dev-main",
|
||||||
"dist": {
|
"dist": {
|
||||||
"type": "path",
|
"type": "path",
|
||||||
"url": "packages/Lvl0/FediDiscover",
|
"url": "packages/Lvl0/FediDiscover",
|
||||||
|
|
@ -2207,73 +2142,6 @@
|
||||||
"relative": true
|
"relative": true
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"name": "masterminds/html5",
|
|
||||||
"version": "2.10.0",
|
|
||||||
"source": {
|
|
||||||
"type": "git",
|
|
||||||
"url": "https://github.com/Masterminds/html5-php.git",
|
|
||||||
"reference": "fcf91eb64359852f00d921887b219479b4f21251"
|
|
||||||
},
|
|
||||||
"dist": {
|
|
||||||
"type": "zip",
|
|
||||||
"url": "https://api.github.com/repos/Masterminds/html5-php/zipball/fcf91eb64359852f00d921887b219479b4f21251",
|
|
||||||
"reference": "fcf91eb64359852f00d921887b219479b4f21251",
|
|
||||||
"shasum": ""
|
|
||||||
},
|
|
||||||
"require": {
|
|
||||||
"ext-dom": "*",
|
|
||||||
"php": ">=5.3.0"
|
|
||||||
},
|
|
||||||
"require-dev": {
|
|
||||||
"phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7 || ^8 || ^9"
|
|
||||||
},
|
|
||||||
"type": "library",
|
|
||||||
"extra": {
|
|
||||||
"branch-alias": {
|
|
||||||
"dev-master": "2.7-dev"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"autoload": {
|
|
||||||
"psr-4": {
|
|
||||||
"Masterminds\\": "src"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"notification-url": "https://packagist.org/downloads/",
|
|
||||||
"license": [
|
|
||||||
"MIT"
|
|
||||||
],
|
|
||||||
"authors": [
|
|
||||||
{
|
|
||||||
"name": "Matt Butcher",
|
|
||||||
"email": "technosophos@gmail.com"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Matt Farina",
|
|
||||||
"email": "matt@mattfarina.com"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Asmir Mustafic",
|
|
||||||
"email": "goetas@gmail.com"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"description": "An HTML5 parser and serializer.",
|
|
||||||
"homepage": "http://masterminds.github.io/html5-php",
|
|
||||||
"keywords": [
|
|
||||||
"HTML5",
|
|
||||||
"dom",
|
|
||||||
"html",
|
|
||||||
"parser",
|
|
||||||
"querypath",
|
|
||||||
"serializer",
|
|
||||||
"xml"
|
|
||||||
],
|
|
||||||
"support": {
|
|
||||||
"issues": "https://github.com/Masterminds/html5-php/issues",
|
|
||||||
"source": "https://github.com/Masterminds/html5-php/tree/2.10.0"
|
|
||||||
},
|
|
||||||
"time": "2025-07-25T09:04:22+00:00"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"name": "monolog/monolog",
|
"name": "monolog/monolog",
|
||||||
"version": "3.10.0",
|
"version": "3.10.0",
|
||||||
|
|
@ -3861,78 +3729,6 @@
|
||||||
],
|
],
|
||||||
"time": "2024-09-25T14:21:43+00:00"
|
"time": "2024-09-25T14:21:43+00:00"
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"name": "symfony/dom-crawler",
|
|
||||||
"version": "v7.4.8",
|
|
||||||
"source": {
|
|
||||||
"type": "git",
|
|
||||||
"url": "https://github.com/symfony/dom-crawler.git",
|
|
||||||
"reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8"
|
|
||||||
},
|
|
||||||
"dist": {
|
|
||||||
"type": "zip",
|
|
||||||
"url": "https://api.github.com/repos/symfony/dom-crawler/zipball/2918e7c2ba964defca1f5b69c6f74886529e2dc8",
|
|
||||||
"reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8",
|
|
||||||
"shasum": ""
|
|
||||||
},
|
|
||||||
"require": {
|
|
||||||
"masterminds/html5": "^2.6",
|
|
||||||
"php": ">=8.2",
|
|
||||||
"symfony/deprecation-contracts": "^2.5|^3",
|
|
||||||
"symfony/polyfill-ctype": "~1.8",
|
|
||||||
"symfony/polyfill-mbstring": "~1.0"
|
|
||||||
},
|
|
||||||
"require-dev": {
|
|
||||||
"symfony/css-selector": "^6.4|^7.0|^8.0"
|
|
||||||
},
|
|
||||||
"type": "library",
|
|
||||||
"autoload": {
|
|
||||||
"psr-4": {
|
|
||||||
"Symfony\\Component\\DomCrawler\\": ""
|
|
||||||
},
|
|
||||||
"exclude-from-classmap": [
|
|
||||||
"/Tests/"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"notification-url": "https://packagist.org/downloads/",
|
|
||||||
"license": [
|
|
||||||
"MIT"
|
|
||||||
],
|
|
||||||
"authors": [
|
|
||||||
{
|
|
||||||
"name": "Fabien Potencier",
|
|
||||||
"email": "fabien@symfony.com"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Symfony Community",
|
|
||||||
"homepage": "https://symfony.com/contributors"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"description": "Eases DOM navigation for HTML and XML documents",
|
|
||||||
"homepage": "https://symfony.com",
|
|
||||||
"support": {
|
|
||||||
"source": "https://github.com/symfony/dom-crawler/tree/v7.4.8"
|
|
||||||
},
|
|
||||||
"funding": [
|
|
||||||
{
|
|
||||||
"url": "https://symfony.com/sponsor",
|
|
||||||
"type": "custom"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://github.com/fabpot",
|
|
||||||
"type": "github"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://github.com/nicolas-grekas",
|
|
||||||
"type": "github"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
|
|
||||||
"type": "tidelift"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"time": "2026-03-24T13:12:05+00:00"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"name": "symfony/error-handler",
|
"name": "symfony/error-handler",
|
||||||
"version": "v7.4.8",
|
"version": "v7.4.8",
|
||||||
|
|
@ -4620,7 +4416,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/polyfill-ctype",
|
"name": "symfony/polyfill-ctype",
|
||||||
"version": "v1.37.0",
|
"version": "v1.36.0",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/polyfill-ctype.git",
|
"url": "https://github.com/symfony/polyfill-ctype.git",
|
||||||
|
|
@ -4679,7 +4475,7 @@
|
||||||
"portable"
|
"portable"
|
||||||
],
|
],
|
||||||
"support": {
|
"support": {
|
||||||
"source": "https://github.com/symfony/polyfill-ctype/tree/v1.37.0"
|
"source": "https://github.com/symfony/polyfill-ctype/tree/v1.36.0"
|
||||||
},
|
},
|
||||||
"funding": [
|
"funding": [
|
||||||
{
|
{
|
||||||
|
|
@ -4703,16 +4499,16 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/polyfill-intl-grapheme",
|
"name": "symfony/polyfill-intl-grapheme",
|
||||||
"version": "v1.37.0",
|
"version": "v1.36.0",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/polyfill-intl-grapheme.git",
|
"url": "https://github.com/symfony/polyfill-intl-grapheme.git",
|
||||||
"reference": "4864388bfbd3001ce88e234fab652acd91fdc57e"
|
"reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df"
|
||||||
},
|
},
|
||||||
"dist": {
|
"dist": {
|
||||||
"type": "zip",
|
"type": "zip",
|
||||||
"url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/4864388bfbd3001ce88e234fab652acd91fdc57e",
|
"url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/ad1b7b9092976d6c948b8a187cec9faaea9ec1df",
|
||||||
"reference": "4864388bfbd3001ce88e234fab652acd91fdc57e",
|
"reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df",
|
||||||
"shasum": ""
|
"shasum": ""
|
||||||
},
|
},
|
||||||
"require": {
|
"require": {
|
||||||
|
|
@ -4761,7 +4557,7 @@
|
||||||
"shim"
|
"shim"
|
||||||
],
|
],
|
||||||
"support": {
|
"support": {
|
||||||
"source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.37.0"
|
"source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.36.0"
|
||||||
},
|
},
|
||||||
"funding": [
|
"funding": [
|
||||||
{
|
{
|
||||||
|
|
@ -4781,11 +4577,11 @@
|
||||||
"type": "tidelift"
|
"type": "tidelift"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": "2026-04-26T13:13:48+00:00"
|
"time": "2026-04-10T16:19:22+00:00"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/polyfill-intl-idn",
|
"name": "symfony/polyfill-intl-idn",
|
||||||
"version": "v1.37.0",
|
"version": "v1.36.0",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/polyfill-intl-idn.git",
|
"url": "https://github.com/symfony/polyfill-intl-idn.git",
|
||||||
|
|
@ -4848,7 +4644,7 @@
|
||||||
"shim"
|
"shim"
|
||||||
],
|
],
|
||||||
"support": {
|
"support": {
|
||||||
"source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.37.0"
|
"source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.36.0"
|
||||||
},
|
},
|
||||||
"funding": [
|
"funding": [
|
||||||
{
|
{
|
||||||
|
|
@ -4872,7 +4668,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/polyfill-intl-normalizer",
|
"name": "symfony/polyfill-intl-normalizer",
|
||||||
"version": "v1.37.0",
|
"version": "v1.36.0",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/polyfill-intl-normalizer.git",
|
"url": "https://github.com/symfony/polyfill-intl-normalizer.git",
|
||||||
|
|
@ -4933,7 +4729,7 @@
|
||||||
"shim"
|
"shim"
|
||||||
],
|
],
|
||||||
"support": {
|
"support": {
|
||||||
"source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.37.0"
|
"source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.36.0"
|
||||||
},
|
},
|
||||||
"funding": [
|
"funding": [
|
||||||
{
|
{
|
||||||
|
|
@ -4957,7 +4753,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/polyfill-mbstring",
|
"name": "symfony/polyfill-mbstring",
|
||||||
"version": "v1.37.0",
|
"version": "v1.36.0",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/polyfill-mbstring.git",
|
"url": "https://github.com/symfony/polyfill-mbstring.git",
|
||||||
|
|
@ -5018,7 +4814,7 @@
|
||||||
"shim"
|
"shim"
|
||||||
],
|
],
|
||||||
"support": {
|
"support": {
|
||||||
"source": "https://github.com/symfony/polyfill-mbstring/tree/v1.37.0"
|
"source": "https://github.com/symfony/polyfill-mbstring/tree/v1.36.0"
|
||||||
},
|
},
|
||||||
"funding": [
|
"funding": [
|
||||||
{
|
{
|
||||||
|
|
@ -5042,7 +4838,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/polyfill-php80",
|
"name": "symfony/polyfill-php80",
|
||||||
"version": "v1.37.0",
|
"version": "v1.36.0",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/polyfill-php80.git",
|
"url": "https://github.com/symfony/polyfill-php80.git",
|
||||||
|
|
@ -5102,7 +4898,7 @@
|
||||||
"shim"
|
"shim"
|
||||||
],
|
],
|
||||||
"support": {
|
"support": {
|
||||||
"source": "https://github.com/symfony/polyfill-php80/tree/v1.37.0"
|
"source": "https://github.com/symfony/polyfill-php80/tree/v1.36.0"
|
||||||
},
|
},
|
||||||
"funding": [
|
"funding": [
|
||||||
{
|
{
|
||||||
|
|
@ -5126,7 +4922,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/polyfill-php83",
|
"name": "symfony/polyfill-php83",
|
||||||
"version": "v1.37.0",
|
"version": "v1.36.0",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/polyfill-php83.git",
|
"url": "https://github.com/symfony/polyfill-php83.git",
|
||||||
|
|
@ -5182,7 +4978,7 @@
|
||||||
"shim"
|
"shim"
|
||||||
],
|
],
|
||||||
"support": {
|
"support": {
|
||||||
"source": "https://github.com/symfony/polyfill-php83/tree/v1.37.0"
|
"source": "https://github.com/symfony/polyfill-php83/tree/v1.36.0"
|
||||||
},
|
},
|
||||||
"funding": [
|
"funding": [
|
||||||
{
|
{
|
||||||
|
|
@ -5206,7 +5002,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/polyfill-php84",
|
"name": "symfony/polyfill-php84",
|
||||||
"version": "v1.37.0",
|
"version": "v1.36.0",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/polyfill-php84.git",
|
"url": "https://github.com/symfony/polyfill-php84.git",
|
||||||
|
|
@ -5262,7 +5058,7 @@
|
||||||
"shim"
|
"shim"
|
||||||
],
|
],
|
||||||
"support": {
|
"support": {
|
||||||
"source": "https://github.com/symfony/polyfill-php84/tree/v1.37.0"
|
"source": "https://github.com/symfony/polyfill-php84/tree/v1.36.0"
|
||||||
},
|
},
|
||||||
"funding": [
|
"funding": [
|
||||||
{
|
{
|
||||||
|
|
@ -5286,16 +5082,16 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/polyfill-php85",
|
"name": "symfony/polyfill-php85",
|
||||||
"version": "v1.37.0",
|
"version": "v1.36.0",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/polyfill-php85.git",
|
"url": "https://github.com/symfony/polyfill-php85.git",
|
||||||
"reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee"
|
"reference": "2c408a6bb0313e6001a83628dc5506100474254e"
|
||||||
},
|
},
|
||||||
"dist": {
|
"dist": {
|
||||||
"type": "zip",
|
"type": "zip",
|
||||||
"url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/fcfa4973a9917cef23f2e38774da74a2b7d115ee",
|
"url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/2c408a6bb0313e6001a83628dc5506100474254e",
|
||||||
"reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee",
|
"reference": "2c408a6bb0313e6001a83628dc5506100474254e",
|
||||||
"shasum": ""
|
"shasum": ""
|
||||||
},
|
},
|
||||||
"require": {
|
"require": {
|
||||||
|
|
@ -5342,7 +5138,7 @@
|
||||||
"shim"
|
"shim"
|
||||||
],
|
],
|
||||||
"support": {
|
"support": {
|
||||||
"source": "https://github.com/symfony/polyfill-php85/tree/v1.37.0"
|
"source": "https://github.com/symfony/polyfill-php85/tree/v1.36.0"
|
||||||
},
|
},
|
||||||
"funding": [
|
"funding": [
|
||||||
{
|
{
|
||||||
|
|
@ -5362,11 +5158,11 @@
|
||||||
"type": "tidelift"
|
"type": "tidelift"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": "2026-04-26T13:10:57+00:00"
|
"time": "2026-04-10T16:50:15+00:00"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/polyfill-uuid",
|
"name": "symfony/polyfill-uuid",
|
||||||
"version": "v1.37.0",
|
"version": "v1.36.0",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/polyfill-uuid.git",
|
"url": "https://github.com/symfony/polyfill-uuid.git",
|
||||||
|
|
@ -5425,7 +5221,7 @@
|
||||||
"uuid"
|
"uuid"
|
||||||
],
|
],
|
||||||
"support": {
|
"support": {
|
||||||
"source": "https://github.com/symfony/polyfill-uuid/tree/v1.37.0"
|
"source": "https://github.com/symfony/polyfill-uuid/tree/v1.36.0"
|
||||||
},
|
},
|
||||||
"funding": [
|
"funding": [
|
||||||
{
|
{
|
||||||
|
|
@ -6263,16 +6059,16 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "voku/portable-ascii",
|
"name": "voku/portable-ascii",
|
||||||
"version": "2.1.1",
|
"version": "2.1.0",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/voku/portable-ascii.git",
|
"url": "https://github.com/voku/portable-ascii.git",
|
||||||
"reference": "8e1051fe39379367aecf014f41744ce7539a856f"
|
"reference": "d870a33f0f79d2b4579740b0620200221ee44aeb"
|
||||||
},
|
},
|
||||||
"dist": {
|
"dist": {
|
||||||
"type": "zip",
|
"type": "zip",
|
||||||
"url": "https://api.github.com/repos/voku/portable-ascii/zipball/8e1051fe39379367aecf014f41744ce7539a856f",
|
"url": "https://api.github.com/repos/voku/portable-ascii/zipball/d870a33f0f79d2b4579740b0620200221ee44aeb",
|
||||||
"reference": "8e1051fe39379367aecf014f41744ce7539a856f",
|
"reference": "d870a33f0f79d2b4579740b0620200221ee44aeb",
|
||||||
"shasum": ""
|
"shasum": ""
|
||||||
},
|
},
|
||||||
"require": {
|
"require": {
|
||||||
|
|
@ -6309,7 +6105,7 @@
|
||||||
],
|
],
|
||||||
"support": {
|
"support": {
|
||||||
"issues": "https://github.com/voku/portable-ascii/issues",
|
"issues": "https://github.com/voku/portable-ascii/issues",
|
||||||
"source": "https://github.com/voku/portable-ascii/tree/2.1.1"
|
"source": "https://github.com/voku/portable-ascii/tree/2.1.0"
|
||||||
},
|
},
|
||||||
"funding": [
|
"funding": [
|
||||||
{
|
{
|
||||||
|
|
@ -6333,7 +6129,7 @@
|
||||||
"type": "tidelift"
|
"type": "tidelift"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": "2026-04-26T05:33:54+00:00"
|
"time": "2026-04-16T23:10:39+00:00"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"packages-dev": [
|
"packages-dev": [
|
||||||
|
|
|
||||||
|
|
@ -1,46 +0,0 @@
|
||||||
<?php

declare(strict_types=1);

/*
 * Crawler settings.
 *
 * Every numeric option is cast to int: env() hands back a string for any
 * value that is actually set in the environment, while the fallback defaults
 * below are ints. Casting keeps the type of each option stable regardless of
 * where the value came from.
 */
return [
    /*
    |---------------------------------------------------------------------------
    | HTTP timeout (seconds)
    |---------------------------------------------------------------------------
    |
    | Hard cap on a single fetch. Guzzle's default is 0 (wait forever) — never
    | acceptable for a crawler. Tune up cautiously; longer timeouts amplify the
    | impact of slow targets on overall throughput.
    |
    */

    'timeout' => (int) env('CRAWLER_TIMEOUT', 10),

    /*
    |---------------------------------------------------------------------------
    | Maximum redirects to follow
    |---------------------------------------------------------------------------
    |
    | Guzzle default is 5. Cross-origin redirects are accepted in v0.1 — the
    | search engine treats the post-redirect URL as the canonical one for
    | indexing.
    |
    */

    'max_redirects' => (int) env('CRAWLER_MAX_REDIRECTS', 5),

    /*
    |---------------------------------------------------------------------------
    | User-Agent
    |---------------------------------------------------------------------------
    |
    | Identifies our crawler to target servers. The placeholder below is for
    | v0.1 development; ticket #10 replaces it with the production identity
    | and adds a `/bot` info page that the URL points at.
    |
    */

    'user_agent' => env('CRAWLER_USER_AGENT', 'TroveBot/0.1 (+https://trove.lvl0.xyz/bot)'),

    /*
    |---------------------------------------------------------------------------
    | Minimum per-domain delay (seconds)
    |---------------------------------------------------------------------------
    |
    | NOTE(review): presumably the minimum wait between two fetches against
    | the same domain — confirm against the code that reads this value.
    |
    */

    'min_domain_delay_seconds' => (int) env('CRAWLER_MIN_DOMAIN_DELAY_SECONDS', 10),
];
|
|
||||||
|
|
@ -1,282 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
return [
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Component Locations
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| This value sets the root directories that'll be used to resolve view-based
|
|
||||||
| components like single and multi-file components. The make command will
|
|
||||||
| use the first directory in this array to add new component files to.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'component_locations' => [
|
|
||||||
resource_path('views/components'),
|
|
||||||
resource_path('views/livewire'),
|
|
||||||
],
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Component Namespaces
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| This value sets default namespaces that will be used to resolve view-based
|
|
||||||
| components like single-file and multi-file components. These folders will
|
|
||||||
| also be referenced when creating new components via the make command.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'component_namespaces' => [
|
|
||||||
'layouts' => resource_path('views/layouts'),
|
|
||||||
'pages' => resource_path('views/pages'),
|
|
||||||
],
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Page Layout
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| The view that will be used as the layout when rendering a single component as
|
|
||||||
| an entire page via `Route::livewire('/post/create', 'pages::create-post')`.
|
|
||||||
| In this case, the content of pages::create-post will render into $slot.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'component_layout' => 'layouts::app',
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Lazy Loading Placeholder
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Livewire allows you to lazy load components that would otherwise slow down
|
|
||||||
| the initial page load. Every component can have a custom placeholder or
|
|
||||||
| you can define the default placeholder view for all components below.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'component_placeholder' => null, // Example: 'placeholders::skeleton'
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Make Command
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| This value determines the default configuration for the artisan make command.
|
|
||||||
| You can configure the component type (sfc, mfc, class) and whether to use
|
|
||||||
| the high-voltage (⚡) emoji as a prefix in the sfc|mfc component names.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'make_command' => [
|
|
||||||
'type' => 'class', // Options: 'sfc', 'mfc', 'class'
|
|
||||||
'emoji' => false, // Options: true, false
|
|
||||||
'with' => [
|
|
||||||
'js' => false,
|
|
||||||
'css' => false,
|
|
||||||
'test' => false,
|
|
||||||
],
|
|
||||||
],
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Class Namespace
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| This value sets the root class namespace for Livewire component classes in
|
|
||||||
| your application. This value will change where component auto-discovery
|
|
||||||
| finds components. It's also referenced by the file creation commands.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'class_namespace' => 'App\\Livewire',
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Class Path
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| This value is used to specify the path where Livewire component class files
|
|
||||||
| are created when running creation commands like `artisan make:livewire`.
|
|
||||||
| This path is customizable to match your project's directory structure.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'class_path' => app_path('Livewire'),
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| View Path
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| This value is used to specify where Livewire component Blade templates are
|
|
||||||
| stored when running file creation commands like `artisan make:livewire`.
|
|
||||||
| It is also used if you choose to omit a component's render() method.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'view_path' => resource_path('views/livewire'),
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Temporary File Uploads
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| Livewire handles file uploads by storing uploads in a temporary directory
|
|
||||||
| before the file is stored permanently. All file uploads are directed to
|
|
||||||
| a global endpoint for temporary storage. You may configure this below:
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'temporary_file_upload' => [
|
|
||||||
'disk' => env('LIVEWIRE_TEMPORARY_FILE_UPLOAD_DISK'), // Example: 'local', 's3' | Default: 'default'
|
|
||||||
'rules' => null, // Example: ['file', 'mimes:png,jpg'] | Default: ['required', 'file', 'max:12288'] (12MB)
|
|
||||||
'directory' => null, // Example: 'tmp' | Default: 'livewire-tmp'
|
|
||||||
'middleware' => null, // Example: 'throttle:5,1' | Default: 'throttle:60,1'
|
|
||||||
'preview_mimes' => [ // Supported file types for temporary pre-signed file URLs...
|
|
||||||
'png', 'gif', 'bmp', 'svg', 'wav', 'mp4',
|
|
||||||
'mov', 'avi', 'wmv', 'mp3', 'm4a',
|
|
||||||
'jpg', 'jpeg', 'mpga', 'webp', 'wma',
|
|
||||||
],
|
|
||||||
'max_upload_time' => 5, // Max duration (in minutes) before an upload is invalidated...
|
|
||||||
'cleanup' => true, // Should cleanup temporary uploads older than 24 hrs...
|
|
||||||
],
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Render On Redirect
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| This value determines if Livewire will run a component's `render()` method
|
|
||||||
| after a redirect has been triggered using something like `redirect(...)`.
|
|
||||||
| Setting this to true will render the view once more before redirecting.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'render_on_redirect' => false,
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Eloquent Model Binding
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| Previous versions of Livewire supported binding directly to eloquent model
|
|
||||||
| properties using wire:model by default. However, this behavior has been
|
|
||||||
| deemed too "magical" and has therefore been put under a feature flag.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'legacy_model_binding' => false,
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Auto-inject Frontend Assets
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| By default, Livewire automatically injects its JavaScript and CSS into the
|
|
||||||
| <head> and <body> of pages containing Livewire components. By disabling
|
|
||||||
| this behavior, you need to use @livewireStyles and @livewireScripts.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'inject_assets' => true,
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Navigate (SPA mode)
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| By adding `wire:navigate` to links in your Livewire application, Livewire
|
|
||||||
| will prevent the default link handling and instead request those pages
|
|
||||||
| via AJAX, creating an SPA-like effect. Configure this behavior here.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'navigate' => [
|
|
||||||
'show_progress_bar' => true,
|
|
||||||
'progress_bar_color' => '#2299dd',
|
|
||||||
],
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| HTML Morph Markers
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| Livewire intelligently "morphs" existing HTML into the newly rendered HTML
|
|
||||||
| after each update. To make this process more reliable, Livewire injects
|
|
||||||
| "markers" into the rendered Blade surrounding @if, @class & @foreach.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'inject_morph_markers' => true,
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Smart Wire Keys
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| Livewire uses loops and keys used within loops to generate smart keys that
|
|
||||||
| are applied to nested components that don't have them. This makes using
|
|
||||||
| nested components more reliable by ensuring that they all have keys.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'smart_wire_keys' => true,
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Pagination Theme
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| When enabling Livewire's pagination feature by using the `WithPagination`
|
|
||||||
| trait, Livewire will use Tailwind templates to render pagination views
|
|
||||||
| on the page. If you want Bootstrap CSS, you can specify: "bootstrap"
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'pagination_theme' => 'tailwind',
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Release Token
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| This token is stored client-side and sent along with each request to check
|
|
||||||
| a user's session to see if a new release has invalidated it. If there is
|
|
||||||
| a mismatch it will throw an error and prompt for a browser refresh.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'release_token' => 'a',
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| CSP Safe
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| This config is used to determine if Livewire will use the CSP-safe version
|
|
||||||
| of Alpine in its bundle. This is useful for applications that are using
|
|
||||||
| strict Content Security Policy (CSP) to protect against XSS attacks.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'csp_safe' => false,
|
|
||||||
|
|
||||||
/*
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
| Payload Guards
|
|
||||||
|---------------------------------------------------------------------------
|
|
||||||
|
|
|
||||||
| These settings protect against malicious or oversized payloads that could
|
|
||||||
| cause denial of service. The default values should feel reasonable for
|
|
||||||
| most web applications. Each can be set to null to disable the limit.
|
|
||||||
|
|
|
||||||
*/
|
|
||||||
|
|
||||||
'payload' => [
|
|
||||||
'max_size' => 1024 * 1024, // 1MB - maximum request payload size in bytes
|
|
||||||
'max_nesting_depth' => 10, // Maximum depth of dot-notation property paths
|
|
||||||
'max_calls' => 50, // Maximum method calls per request
|
|
||||||
'max_components' => 20, // Maximum components per batch request
|
|
||||||
],
|
|
||||||
];
|
|
||||||
|
|
@ -1,53 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace Database\Factories;
|
|
||||||
|
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
|
||||||
use App\Models\Page;
|
|
||||||
use App\Models\PageCrawl;
|
|
||||||
use Illuminate\Database\Eloquent\Factories\Factory;
|
|
||||||
|
|
||||||
/**
 * Factory for PageCrawl models.
 *
 * The base definition describes a crawl with no result recorded yet; the
 * state helpers below attach a page or stamp an outcome onto it.
 *
 * @extends Factory<PageCrawl>
 */
class PageCrawlFactory extends Factory
{
    /**
     * Baseline attributes: no page, no outcome, nothing completed.
     *
     * NOTE(review): page_id defaults to null here; callers are presumably
     * expected to chain page() before persisting — confirm against the
     * page_crawls schema.
     *
     * @return array<string, mixed>
     */
    public function definition(): array
    {
        $defaults = [
            'page_id' => null,
            'domain' => 'example.com',
            'priority' => 0,
            'completed_at' => null,
            'outcome' => null,
            'status_code' => null,
            'error_message' => null,
        ];

        return $defaults;
    }

    /**
     * Attach the crawl to the given page.
     *
     * The id is read inside the closure, i.e. when the factory resolves its
     * attributes rather than when this method is called.
     */
    public function page(Page $page): static
    {
        $forPage = function () use ($page): array {
            return ['page_id' => $page->id];
        };

        return $this->state($forPage);
    }

    /**
     * Mark the crawl as completed with a Success outcome.
     */
    public function successful(): static
    {
        $succeeded = function (): array {
            return [
                'outcome' => CrawlOutcomeEnum::Success,
                'completed_at' => now(),
            ];
        };

        return $this->state($succeeded);
    }

    /**
     * Mark the crawl as completed with a Failed outcome and the given message.
     */
    public function failed(string $errorMessage): static
    {
        $failure = function () use ($errorMessage): array {
            return [
                'outcome' => CrawlOutcomeEnum::Failed,
                'completed_at' => now(),
                'error_message' => $errorMessage,
            ];
        };

        return $this->state($failure);
    }
}
|
|
||||||
|
|
@ -15,7 +15,6 @@ public function up(): void
|
||||||
$table->id();
|
$table->id();
|
||||||
$table->text('url')->unique();
|
$table->text('url')->unique();
|
||||||
$table->string('status')->default(PageStatusEnum::Discovered->value)->index();
|
$table->string('status')->default(PageStatusEnum::Discovered->value)->index();
|
||||||
$table->string('language', 35)->nullable()->index();
|
|
||||||
$table->string('title')->nullable();
|
$table->string('title')->nullable();
|
||||||
$table->foreignId('instance_id')
|
$table->foreignId('instance_id')
|
||||||
->nullable()
|
->nullable()
|
||||||
|
|
|
||||||
|
|
@ -1,34 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
use Illuminate\Database\Migrations\Migration;
|
|
||||||
use Illuminate\Database\Schema\Blueprint;
|
|
||||||
use Illuminate\Support\Facades\Schema;
|
|
||||||
|
|
||||||
/**
 * Creates (and on rollback drops) the `page_crawls` table.
 */
return new class extends Migration
{
    /**
     * Create the `page_crawls` table.
     */
    public function up(): void
    {
        Schema::create('page_crawls', static function (Blueprint $crawls): void {
            $crawls->id();

            // Crawl rows are deleted together with the page they belong to.
            $crawls->foreignId('page_id')->constrained('pages')->cascadeOnDelete();

            $crawls->string('domain');
            $crawls->smallInteger('priority')->default(0);

            // Result columns stay null until a crawl has been processed.
            $crawls->timestampTz('completed_at')->nullable();
            $crawls->string('outcome')->nullable();
            $crawls->smallInteger('status_code')->nullable();
            $crawls->text('error_message')->nullable();

            $crawls->timestampsTz();

            $crawls->index(['page_id', 'created_at']);
        });
    }

    /**
     * Drop the `page_crawls` table.
     */
    public function down(): void
    {
        Schema::dropIfExists('page_crawls');
    }
};
|
|
||||||
|
|
@ -1,60 +0,0 @@
|
||||||
<x-layout>
|
|
||||||
<main>
|
|
||||||
<h1>About TroveBot</h1>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
<strong>Trove</strong> is a federated search engine for the small web,
|
|
||||||
seeded by fediverse attention and ranked by domain coherence rather than
|
|
||||||
commercial authority. <strong>TroveBot</strong> is its crawler — it
|
|
||||||
discovers and indexes URLs shared by people on the fediverse, then
|
|
||||||
follows the citations they make to find more of the small web.
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<h2>Identity</h2>
|
|
||||||
|
|
||||||
<p>TroveBot identifies itself with the following User-Agent string:</p>
|
|
||||||
|
|
||||||
<pre><code>TroveBot/0.1 (+https://trove.lvl0.xyz/bot)</code></pre>
|
|
||||||
|
|
||||||
<h2>Crawling behavior</h2>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li>Respects <code>robots.txt</code> rules under <code>User-agent: TroveBot</code> (and the wildcard <code>User-agent: *</code> as a fallback).</li>
|
|
||||||
<li>Polite per-domain rate limit — at most a few requests per minute per host.</li>
|
|
||||||
<li>Follows up to 5 redirects per URL.</li>
|
|
||||||
<li>Fetches HTML only. PDFs, images, and other binary content are recorded as discovered but never re-fetched.</li>
|
|
||||||
<li>Does not execute JavaScript, does not crawl behind authentication, does not crawl URLs containing user credentials.</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2>Opt out</h2>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Block TroveBot entirely by adding the following to your site's
|
|
||||||
<code>robots.txt</code>:
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<pre><code>User-agent: TroveBot
|
|
||||||
Disallow: /</code></pre>
|
|
||||||
|
|
||||||
<p>
|
|
||||||
Or block specific paths:
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<pre><code>User-agent: TroveBot
|
|
||||||
Disallow: /private/
|
|
||||||
Disallow: /admin/</code></pre>
|
|
||||||
|
|
||||||
<h2>Contact &amp; source</h2>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li>
|
|
||||||
Issues, questions, abuse reports:
|
|
||||||
<a href="https://forge.lvl0.xyz/lvl0/trove/issues">forge.lvl0.xyz/lvl0/trove/issues</a>
|
|
||||||
</li>
|
|
||||||
<li>
|
|
||||||
Source code:
|
|
||||||
<a href="https://forge.lvl0.xyz/lvl0/trove">forge.lvl0.xyz/lvl0/trove</a>
|
|
||||||
</li>
|
|
||||||
</ul>
|
|
||||||
</main>
|
|
||||||
</x-layout>
|
|
||||||
|
|
@ -1,18 +0,0 @@
|
||||||
<!DOCTYPE html>
|
|
||||||
<html lang="{{ str_replace('_', '-', app()->getLocale()) }}">
|
|
||||||
<head>
|
|
||||||
<meta charset="utf-8">
|
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
||||||
|
|
||||||
<title>{{ $title ?? config('app.name') }}</title>
|
|
||||||
|
|
||||||
@vite(['resources/css/app.css', 'resources/js/app.js'])
|
|
||||||
|
|
||||||
@livewireStyles
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
{{ $slot }}
|
|
||||||
|
|
||||||
@livewireScripts
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
|
@ -1,14 +0,0 @@
|
||||||
<div>
|
|
||||||
@error('rate_limit') <p>{{ $message }}</p> @enderror
|
|
||||||
|
|
||||||
@if ($confirmedUrl !== null)
|
|
||||||
<p>Thanks, we've received <strong>{{ $confirmedUrl }}</strong></p>
|
|
||||||
@else
|
|
||||||
<form wire:submit="submit">
|
|
||||||
<label for="url">URL</label>
|
|
||||||
<input id="url" type="url" wire:model="url" required>
|
|
||||||
@error('url') <p>{{ $message }}</p> @enderror
|
|
||||||
<button type="submit">Submit</button>
|
|
||||||
</form>
|
|
||||||
@endif
|
|
||||||
</div>
|
|
||||||
|
|
@ -1,3 +0,0 @@
|
||||||
<x-layout>
|
|
||||||
<livewire:url-submission-form />
|
|
||||||
</x-layout>
|
|
||||||
|
|
@ -1,13 +1,7 @@
|
||||||
<?php
|
<?php
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
use Illuminate\Support\Facades\Route;
|
use Illuminate\Support\Facades\Route;
|
||||||
|
|
||||||
Route::get('/', function () {
|
Route::get('/', function () {
|
||||||
return view('welcome');
|
return view('welcome');
|
||||||
});
|
});
|
||||||
|
|
||||||
Route::view('/submit', 'urls.submit');
|
|
||||||
|
|
||||||
Route::view('/bot', 'bot');
|
|
||||||
|
|
|
||||||
|
|
@ -92,10 +92,6 @@ pkgs.mkShell {
|
||||||
podman-compose -f $COMPOSE_FILE exec app php artisan "$@"
|
podman-compose -f $COMPOSE_FILE exec app php artisan "$@"
|
||||||
}
|
}
|
||||||
|
|
||||||
dev-composer() {
|
|
||||||
podman-compose -f $COMPOSE_FILE exec app composer "$@"
|
|
||||||
}
|
|
||||||
|
|
||||||
# ===================
|
# ===================
|
||||||
# BUILD COMMANDS
|
# BUILD COMMANDS
|
||||||
# ===================
|
# ===================
|
||||||
|
|
@ -145,7 +141,6 @@ pkgs.mkShell {
|
||||||
echo " dev-logs-redis Tail Redis logs"
|
echo " dev-logs-redis Tail Redis logs"
|
||||||
echo " dev-shell Shell into app container"
|
echo " dev-shell Shell into app container"
|
||||||
echo " dev-artisan <cmd> Run artisan command"
|
echo " dev-artisan <cmd> Run artisan command"
|
||||||
echo " dev-composer <cmd> Run composer command"
|
|
||||||
echo " base-build Build and push image"
|
echo " base-build Build and push image"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Services:"
|
echo "Services:"
|
||||||
|
|
|
||||||
|
|
@ -1,330 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace Tests\Feature\Actions;
|
|
||||||
|
|
||||||
use App\Actions\FetchPageAction;
|
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
|
||||||
use App\ValueObjects\FetchResult;
|
|
||||||
use GuzzleHttp\Exception\ConnectException;
|
|
||||||
use GuzzleHttp\Psr7\Request;
|
|
||||||
use Illuminate\Support\Collection;
|
|
||||||
use Illuminate\Support\Facades\Http;
|
|
||||||
use Tests\TestCase;
|
|
||||||
|
|
||||||
/**
 * Feature tests for FetchPageAction.
 *
 * Every test fakes the Http facade (either a canned response for
 * `example.com/*` or a thrown ConnectException), invokes the action
 * resolved from the container, and inspects the returned result.
 */
class FetchPageActionTest extends TestCase
{
    public function test_successful_html_fetch_returns_success_outcome(): void
    {
        $this->stubExampleResponse('<html><body>Hello</body></html>', 200, ['Content-Type' => 'text/html']);

        $fetched = $this->fetch('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        $this->assertSame(200, $fetched->statusCode);
        $this->assertNotNull($fetched->finalUrl);
    }

    public function test_4xx_response_returns_blocked_4xx(): void
    {
        $this->stubExampleResponse('Not Found', 404);

        $fetched = $this->fetch('https://example.com/missing');

        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Blocked4xx, $fetched->outcome);
        $this->assertSame(404, $fetched->statusCode);
        $this->assertIsString($fetched->errorMessage);
        $this->assertStringContainsString('404', $fetched->errorMessage);
        $this->assertNotNull($fetched->finalUrl);
    }

    public function test_5xx_response_returns_blocked_5xx(): void
    {
        $this->stubExampleResponse('Service Unavailable', 503);

        $fetched = $this->fetch('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Blocked5xx, $fetched->outcome);
        $this->assertSame(503, $fetched->statusCode);
        $this->assertIsString($fetched->errorMessage);
        $this->assertStringContainsString('503', $fetched->errorMessage);
        $this->assertNotNull($fetched->finalUrl);
    }

    public function test_non_html_content_type_returns_rejected(): void
    {
        $this->stubExampleResponse('PDF binary stuff', 200, ['Content-Type' => 'application/pdf']);

        $fetched = $this->fetch('https://example.com/document.pdf');

        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Rejected, $fetched->outcome);
        $this->assertSame(200, $fetched->statusCode);
        $this->assertIsString($fetched->errorMessage);
        $this->assertStringContainsString('application/pdf', $fetched->errorMessage);
        $this->assertNotNull($fetched->finalUrl);

        // No content fields are populated for a rejected response.
        $this->assertNull($fetched->title);
        $this->assertNull($fetched->extractedText);
        $this->assertEmpty($fetched->outboundLinks);
        $this->assertNull($fetched->wordCount);
    }

    public function test_text_html_with_charset_is_accepted(): void
    {
        $this->stubExampleResponse(
            '<html><body>Hello charset world</body></html>',
            200,
            ['Content-Type' => 'text/html; charset=utf-8'],
        );

        $fetched = $this->fetch('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        $this->assertSame(200, $fetched->statusCode);
    }

    public function test_connection_failure_returns_failed(): void
    {
        $this->stubConnectException('Could not resolve host', 6);

        $fetched = $this->fetch('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Failed, $fetched->outcome);
        $this->assertNull($fetched->statusCode);
        $this->assertNull($fetched->finalUrl);
        $this->assertIsString($fetched->errorMessage);
        $this->assertNull($fetched->title);
        $this->assertNull($fetched->extractedText);
        $this->assertEmpty($fetched->outboundLinks);
        $this->assertNull($fetched->wordCount);
    }

    public function test_timeout_returns_timeout(): void
    {
        // errno 28 accompanies the "Operation timed out" cURL message.
        $this->stubConnectException('cURL error 28: Operation timed out', 28);

        $fetched = $this->fetch('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Timeout, $fetched->outcome);
        $this->assertNull($fetched->statusCode);
        $this->assertNull($fetched->finalUrl);
        $this->assertIsString($fetched->errorMessage);
    }

    public function test_success_extracts_title_from_html(): void
    {
        $this->stubExampleResponse(
            '<html><head><title>My Page Title</title></head><body><p>Some content.</p></body></html>',
            200,
            ['Content-Type' => 'text/html'],
        );

        $fetched = $this->fetch('https://example.com/page');

        $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        $this->assertSame('My Page Title', $fetched->title);
    }

    public function test_success_extracts_main_text(): void
    {
        $markup = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Article Title</title></head>
        <body>
        <nav>Navigation links</nav>
        <article>
        <h1>The Real Article</h1>
        <p>This is the main article body that should be extracted by readability.</p>
        <p>Multiple paragraphs prove the extractor works on the full content.</p>
        </article>
        <footer>Site footer noise</footer>
        </body>
        </html>
        HTML;

        $this->stubExampleResponse($markup, 200, ['Content-Type' => 'text/html']);

        $fetched = $this->fetch('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        $this->assertNotNull($fetched->extractedText);
        $this->assertStringContainsString('main article body', $fetched->extractedText);
    }

    public function test_success_extracts_and_filters_outbound_links(): void
    {
        $markup = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Article With Links</title></head>
        <body>
        <nav>
        <a href="/home">Home (nav, should be filtered out by Readability scope)</a>
        </nav>
        <article>
        <h1>Article Title</h1>
        <p>This article references <a href="https://other.com/article">an external article</a>.</p>
        <p>And a <a href="/related-post">relative link to a related post</a> on the same site.</p>
        <p>Plus a <a href="http://192.168.1.1/admin">private IP link</a> that should be rejected.</p>
        <p>And a <a href="https://user:pass@evil.com/">credentials URL</a> that should be rejected.</p>
        <p>And a <a href="ftp://files.example.com/">non-http scheme</a> that should be rejected.</p>
        </article>
        <footer>
        <a href="/privacy">Privacy (footer, filtered by Readability scope)</a>
        </footer>
        </body>
        </html>
        HTML;

        $this->stubExampleResponse($markup, 200, ['Content-Type' => 'text/html']);

        $fetched = $this->fetch('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);

        $links = $fetched->outboundLinks;
        $this->assertInstanceOf(Collection::class, $links);
        $this->assertSame(2, $links->count());

        // The external link and the resolved relative link are kept.
        $this->assertContains('https://other.com/article', $links->all());
        $this->assertContains('https://example.com/related-post', $links->all());

        // Private IP, embedded credentials and non-http schemes are dropped.
        $this->assertNotContains('http://192.168.1.1/admin', $links->all());
        $this->assertNotContains('https://user:pass@evil.com/', $links->all());
        $this->assertNotContains('ftp://files.example.com/', $links->all());
    }

    public function test_success_calculates_word_count(): void
    {
        $markup = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Word Count Test</title></head>
        <body>
        <article>
        <p>This article body has exactly nine words total here.</p>
        </article>
        </body>
        </html>
        HTML;

        $this->stubExampleResponse($markup, 200, ['Content-Type' => 'text/html']);

        $fetched = $this->fetch('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        $this->assertSame(9, $fetched->wordCount);
    }

    public function test_uppercase_content_type_is_accepted_as_html(): void
    {
        $this->stubExampleResponse(
            '<html><head><title>Uppercase CT</title></head><body><p>Content here.</p></body></html>',
            200,
            ['Content-Type' => 'Text/HTML; charset=utf-8'],
        );

        $fetched = $this->fetch('https://example.com/page');

        $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    }

    public function test_empty_href_is_filtered_from_outbound_links(): void
    {
        $markup = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Empty Href Test</title></head>
        <body>
        <article>
        <p>This paragraph has <a href="">an empty href anchor</a> that should be dropped.</p>
        </article>
        </body>
        </html>
        HTML;

        $this->stubExampleResponse($markup, 200, ['Content-Type' => 'text/html']);

        $fetched = $this->fetch('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        $this->assertSame(0, $fetched->outboundLinks->count());
    }

    public function test_fragment_only_href_is_filtered_from_outbound_links(): void
    {
        $markup = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Fragment Href Test</title></head>
        <body>
        <article>
        <p>Jump to <a href="#section-2">section 2</a> of this page.</p>
        </article>
        </body>
        </html>
        HTML;

        $this->stubExampleResponse($markup, 200, ['Content-Type' => 'text/html']);

        $fetched = $this->fetch('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        $this->assertSame(0, $fetched->outboundLinks->count());
    }

    /**
     * Resolve the action under test from the container.
     */
    private function makeAction(): FetchPageAction
    {
        return app(FetchPageAction::class);
    }

    /**
     * Invoke the action for the given URL and return whatever it produces.
     */
    private function fetch(string $url): mixed
    {
        $action = $this->makeAction();

        return $action($url);
    }

    /**
     * Fake every request matching `example.com/*` with a canned response.
     *
     * @param array<string, string> $headers
     */
    private function stubExampleResponse(string $body, int $status = 200, array $headers = []): void
    {
        Http::fake([
            'example.com/*' => Http::response($body, $status, $headers),
        ]);
    }

    /**
     * Make every faked request throw a Guzzle ConnectException carrying the
     * given message and `errno` handler-context value.
     */
    private function stubConnectException(string $message, int $errno): void
    {
        Http::fake(function () use ($message, $errno) {
            throw new ConnectException(
                $message,
                new Request('GET', 'https://example.com/page'),
                null,
                ['errno' => $errno],
            );
        });
    }
}
|
|
||||||
|
|
@ -1,39 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace Tests\Feature;
|
|
||||||
|
|
||||||
use Tests\TestCase;
|
|
||||||
|
|
||||||
/**
 * Smoke tests for the public `/bot` page: it renders and contains the
 * crawler's User-Agent string, a robots.txt opt-out example, and the
 * repository link.
 */
class BotPageTest extends TestCase
{
    public function test_bot_page_renders_at_public_route(): void
    {
        $this->get('/bot')->assertStatus(200);
    }

    public function test_bot_page_contains_user_agent_string(): void
    {
        $page = $this->get('/bot');

        // escape: false compares against the raw string rather than an HTML-escaped copy.
        $page->assertSee('TroveBot/0.1 (+https://trove.lvl0.xyz/bot)', escape: false);
    }

    public function test_bot_page_contains_robots_txt_opt_out_example(): void
    {
        $this->get('/bot')
            ->assertSee('User-agent: TroveBot', escape: false)
            ->assertSee('Disallow: /', escape: false);
    }

    public function test_bot_page_links_to_forge_repository(): void
    {
        $page = $this->get('/bot');

        $page->assertSee('https://forge.lvl0.xyz/lvl0/trove', escape: false);
    }
}
|
|
||||||
|
|
@ -1,393 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace Tests\Feature\Jobs;
|
|
||||||
|
|
||||||
use App\Actions\FetchPageAction;
|
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
|
||||||
use App\Enums\PageStatusEnum;
|
|
||||||
use App\Jobs\ProcessCrawlJob;
|
|
||||||
use App\Models\Page;
|
|
||||||
use App\Models\PageCrawl;
|
|
||||||
use App\ValueObjects\FetchResult;
|
|
||||||
use Carbon\Carbon;
|
|
||||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
|
||||||
use Illuminate\Support\Collection;
|
|
||||||
use Illuminate\Support\Facades\Cache;
|
|
||||||
use Illuminate\Support\Facades\Queue;
|
|
||||||
use Mockery;
|
|
||||||
use Tests\TestCase;
|
|
||||||
|
|
||||||
class ProcessCrawlJobTest extends TestCase
|
|
||||||
{
|
|
||||||
use RefreshDatabase;
|
|
||||||
|
|
||||||
public function test_creating_a_page_crawl_dispatches_process_crawl_job(): void
{
    // Fake the queue so pushed jobs are recorded instead of executed.
    Queue::fake();

    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);

    // Plain create() (not createQuietly()), so model events fire for the new row.
    PageCrawl::factory()
        ->page($article)
        ->create();

    Queue::assertPushed(ProcessCrawlJob::class);
}
|
|
||||||
|
|
||||||
public function test_dispatched_job_carries_the_correct_page_crawl(): void
{
    Queue::fake();

    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $createdCrawl = PageCrawl::factory()->page($article)->create();

    // The pushed job must reference the row that was just created.
    $carriesCreatedCrawl = function (ProcessCrawlJob $queued) use ($createdCrawl) {
        return $queued->pageCrawl->id === $createdCrawl->id;
    };

    Queue::assertPushed(ProcessCrawlJob::class, $carriesCreatedCrawl);
}
|
|
||||||
|
|
||||||
public function test_handle_writes_outcome_to_page_crawl_on_success(): void
{
    Queue::fake();

    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        title: 'Hello',
        extractedText: 'hi',
        wordCount: 1,
    );

    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pendingCrawl = PageCrawl::factory()->page($article)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    // Reload from the database to inspect what handle() persisted on the crawl row.
    $stored = $pendingCrawl->fresh();

    $this->assertSame(CrawlOutcomeEnum::Success, $stored->outcome);
    $this->assertNotNull($stored->completed_at);
    $this->assertInstanceOf(Carbon::class, $stored->completed_at);
    $this->assertSame(200, $stored->status_code);
    $this->assertNull($stored->error_message);
}
|
|
||||||
|
|
||||||
public function test_handle_updates_page_to_fetched_on_success(): void
{
    Queue::fake();

    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Success,
        statusCode: 200,
        title: 'Hello',
        extractedText: 'hi',
        wordCount: 1,
    );

    $article = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
    $pendingCrawl = PageCrawl::factory()->page($article)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    // Reload the page to see the status, timestamp and title written by the job.
    $storedPage = $article->fresh();

    $this->assertSame(PageStatusEnum::Fetched, $storedPage->status);
    $this->assertNotNull($storedPage->fetched_at);
    $this->assertInstanceOf(Carbon::class, $storedPage->fetched_at);
    $this->assertSame('Hello', $storedPage->title);
}
|
|
||||||
|
|
||||||
public function test_handle_updates_page_to_rejected_on_rejected_outcome(): void
{
    Queue::fake();

    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Rejected,
        statusCode: 200,
        errorMessage: 'Unsupported Content-Type: application/pdf',
    );

    $brochure = Page::factory()->createQuietly(['url' => 'https://example.com/brochure.pdf']);
    $pendingCrawl = PageCrawl::factory()->page($brochure)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    $storedPage = $brochure->fresh();

    // A rejected page is marked as such and never receives a fetched_at timestamp.
    $this->assertSame(PageStatusEnum::Rejected, $storedPage->status);
    $this->assertNull($storedPage->fetched_at);
}
|
|
||||||
|
|
||||||
public function test_handle_updates_page_to_failed_on_blocked_4xx(): void
{
    Queue::fake();

    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Blocked4xx,
        statusCode: 404,
        errorMessage: 'HTTP 404',
    );

    $gonePage = Page::factory()->createQuietly(['url' => 'https://example.com/gone']);
    $pendingCrawl = PageCrawl::factory()->page($gonePage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    $storedPage = $gonePage->fresh();

    // A Blocked4xx outcome marks the page Failed and stamps failed_at.
    $this->assertSame(PageStatusEnum::Failed, $storedPage->status);
    $this->assertNotNull($storedPage->failed_at);
    $this->assertInstanceOf(Carbon::class, $storedPage->failed_at);
}
|
|
||||||
|
|
||||||
public function test_handle_updates_page_to_failed_on_timeout(): void
{
    Queue::fake();

    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Timeout,
        errorMessage: 'Connection timed out after 10 seconds',
    );

    $slowPage = Page::factory()->createQuietly(['url' => 'https://example.com/slow']);
    $pendingCrawl = PageCrawl::factory()->page($slowPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    $storedPage = $slowPage->fresh();

    // A Timeout outcome marks the page Failed and stamps failed_at.
    $this->assertSame(PageStatusEnum::Failed, $storedPage->status);
    $this->assertNotNull($storedPage->failed_at);
    $this->assertInstanceOf(Carbon::class, $storedPage->failed_at);
}
|
|
||||||
|
|
||||||
public function test_handle_schedules_retry_on_transient_failure(): void
{
    Queue::fake();

    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

    $flakyPage = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
    $firstAttempt = PageCrawl::factory()->page($flakyPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $firstAttempt]);
    $job->handle();

    // The original attempt plus one retry row must now exist for this page.
    $attemptCount = PageCrawl::where('page_id', $flakyPage->id)->count();
    $this->assertSame(2, $attemptCount);

    // The retry row is still pending, i.e. its outcome is NULL.
    $pendingRetry = PageCrawl::where('page_id', $flakyPage->id)
        ->whereNull('outcome')
        ->first();
    $this->assertNotNull($pendingRetry);

    // A ProcessCrawlJob must have been pushed for exactly that retry row.
    $targetsRetryRow = function (ProcessCrawlJob $queued) use ($flakyPage, $pendingRetry) {
        return $queued->pageCrawl->page_id === $flakyPage->id
            && $queued->pageCrawl->id === $pendingRetry->id;
    };

    Queue::assertPushed(ProcessCrawlJob::class, $targetsRetryRow);
}
|
|
||||||
|
|
||||||
public function test_handle_does_not_retry_after_three_attempts(): void
{
    Queue::fake();

    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

    $deadPage = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);

    // Two earlier attempts have already failed; the pending row below is the
    // third attempt, which reaches the retry cap once it fails as well.
    $failedAttempt = PageCrawl::factory()->page($deadPage)->failed('Connection refused');
    $failedAttempt->createQuietly();
    $failedAttempt->createQuietly();
    $thirdAttempt = PageCrawl::factory()->page($deadPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $thirdAttempt]);
    $job->handle();

    // No fourth row may appear once the cap is reached.
    $attemptCount = PageCrawl::where('page_id', $deadPage->id)->count();
    $this->assertSame(3, $attemptCount);

    // And no retry job may have been queued.
    Queue::assertNotPushed(ProcessCrawlJob::class);
}
|
|
||||||
|
|
||||||
public function test_handle_writes_failed_outcome_to_page_crawl(): void
{
    Queue::fake();

    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'boom');

    $flakyPage = Page::factory()->createQuietly(['url' => 'https://example.com/unstable']);
    $pendingCrawl = PageCrawl::factory()->page($flakyPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    // The processed row must carry the failure details; no HTTP status was supplied.
    $expectedRow = [
        'id' => $pendingCrawl->id,
        'outcome' => CrawlOutcomeEnum::Failed->value,
        'status_code' => null,
        'error_message' => 'boom',
    ];

    $this->assertDatabaseHas('page_crawls', $expectedRow);
}
|
|
||||||
|
|
||||||
public function test_handle_updates_page_to_failed_on_failed_outcome(): void
{
    Queue::fake();

    $this->mockFetchPageAction(CrawlOutcomeEnum::Failed, errorMessage: 'Connection refused');

    $deadPage = Page::factory()->createQuietly(['url' => 'https://example.com/unreachable']);
    $pendingCrawl = PageCrawl::factory()->page($deadPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    // A Failed crawl outcome marks the page itself as Failed.
    $storedPage = $deadPage->fresh();
    $this->assertSame(PageStatusEnum::Failed, $storedPage->status);
}
|
|
||||||
|
|
||||||
public function test_handle_updates_page_to_failed_on_blocked_5xx(): void
{
    Queue::fake();

    $this->mockFetchPageAction(
        CrawlOutcomeEnum::Blocked5xx,
        statusCode: 503,
        errorMessage: 'HTTP 503',
    );

    $overloadedPage = Page::factory()->createQuietly(['url' => 'https://example.com/overloaded']);
    $pendingCrawl = PageCrawl::factory()->page($overloadedPage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    // A Blocked5xx crawl outcome marks the page as Failed.
    $storedPage = $overloadedPage->fresh();
    $this->assertSame(PageStatusEnum::Failed, $storedPage->status);
}
|
|
||||||
|
|
||||||
public function test_handle_updates_page_to_failed_on_blocked_robots(): void
{
    Queue::fake();

    $this->mockFetchPageAction(
        CrawlOutcomeEnum::BlockedRobots,
        errorMessage: 'Disallowed by robots.txt',
    );

    $privatePage = Page::factory()->createQuietly(['url' => 'https://example.com/private']);
    $pendingCrawl = PageCrawl::factory()->page($privatePage)->createQuietly();

    $job = app(ProcessCrawlJob::class, ['pageCrawl' => $pendingCrawl]);
    $job->handle();

    // A BlockedRobots crawl outcome marks the page as Failed.
    $storedPage = $privatePage->fresh();
    $this->assertSame(PageStatusEnum::Failed, $storedPage->status);
}
|
|
||||||
|
|
||||||
public function test_handle_does_not_register_outbound_links_on_failure(): void
|
|
||||||
{
|
|
||||||
Queue::fake();
|
|
||||||
|
|
||||||
$this->mockFetchPageAction(
|
|
||||||
CrawlOutcomeEnum::Failed,
|
|
||||||
outboundLinks: collect(['https://should-not-be-registered.com/page']),
|
|
||||||
errorMessage: 'Connection refused',
|
|
||||||
);
|
|
||||||
|
|
||||||
$page = Page::factory()->createQuietly(['url' => 'https://example.com/broken']);
|
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
|
||||||
->handle();
|
|
||||||
|
|
||||||
$this->assertDatabaseMissing('pages', ['url' => 'https://should-not-be-registered.com/page']);
|
|
||||||
$this->assertSame(1, Page::count());
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_handle_registers_outbound_links_on_success(): void
|
|
||||||
{
|
|
||||||
Queue::fake();
|
|
||||||
|
|
||||||
$this->mockFetchPageAction(
|
|
||||||
CrawlOutcomeEnum::Success,
|
|
||||||
statusCode: 200,
|
|
||||||
finalUrl: 'https://source.com/article',
|
|
||||||
title: 'Source Article',
|
|
||||||
extractedText: 'some text',
|
|
||||||
outboundLinks: collect(['https://other.com/article-1', 'https://another.com/post-2']),
|
|
||||||
wordCount: 2,
|
|
||||||
);
|
|
||||||
|
|
||||||
$page = Page::factory()->createQuietly(['url' => 'https://source.com/article']);
|
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
|
||||||
->handle();
|
|
||||||
|
|
||||||
$this->assertDatabaseHas('pages', ['url' => 'https://other.com/article-1']);
|
|
||||||
$this->assertDatabaseHas('pages', ['url' => 'https://another.com/post-2']);
|
|
||||||
$this->assertSame(3, Page::count());
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_handle_releases_job_when_domain_is_locked(): void
|
|
||||||
{
|
|
||||||
Queue::fake();
|
|
||||||
|
|
||||||
// Pre-acquire the lock so the job sees it as already held
|
|
||||||
Cache::lock('crawler:domain:example.com', 10)->get();
|
|
||||||
|
|
||||||
// The fetcher must NOT be called — the job should bail before reaching it
|
|
||||||
$fetcher = Mockery::mock(FetchPageAction::class);
|
|
||||||
$fetcher->shouldNotReceive('__invoke');
|
|
||||||
$this->app->instance(FetchPageAction::class, $fetcher);
|
|
||||||
|
|
||||||
$page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
|
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
|
||||||
|
|
||||||
$job = new ProcessCrawlJob($crawl);
|
|
||||||
$job->handle();
|
|
||||||
|
|
||||||
// No outcome written — handle() returned early
|
|
||||||
$this->assertNull($crawl->fresh()->outcome);
|
|
||||||
|
|
||||||
// Page status unchanged from its factory default (Discovered)
|
|
||||||
$this->assertSame(PageStatusEnum::Discovered, $page->fresh()->status);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_handle_does_not_release_lock_after_completion(): void
|
|
||||||
{
|
|
||||||
Queue::fake();
|
|
||||||
|
|
||||||
$this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);
|
|
||||||
|
|
||||||
$page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
|
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
|
||||||
|
|
||||||
$job = new ProcessCrawlJob($crawl);
|
|
||||||
$job->handle();
|
|
||||||
|
|
||||||
// If handle() called $lock->release(), this second get() would succeed (true).
|
|
||||||
// It must fail (false) — the lock acquired inside handle() must still be held.
|
|
||||||
$result = Cache::lock('crawler:domain:example.com', 10)->get();
|
|
||||||
$this->assertFalse($result, 'Expected the domain lock to still be held after handle() completed, but it was free — the lock was released prematurely.');
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_handle_acquires_domain_lock_before_fetching(): void
|
|
||||||
{
|
|
||||||
Queue::fake();
|
|
||||||
|
|
||||||
$this->mockFetchPageAction(CrawlOutcomeEnum::Success, statusCode: 200);
|
|
||||||
|
|
||||||
$page = Page::factory()->createQuietly(['url' => 'https://lock-test.example.com/article']);
|
|
||||||
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
|
||||||
|
|
||||||
$domain = $crawl->domain;
|
|
||||||
|
|
||||||
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
|
||||||
->handle();
|
|
||||||
|
|
||||||
// The lock must still be held after handle() completes — a second attempt to acquire it fails
|
|
||||||
$this->assertFalse(
|
|
||||||
Cache::lock("crawler:domain:{$domain}", 10)->get(),
|
|
||||||
'Expected the domain lock to still be held after handle() ran, but it was free.',
|
|
||||||
);
|
|
||||||
|
|
||||||
// The fetch ran — outcome was written (proves the lock did not block execution)
|
|
||||||
$this->assertSame(CrawlOutcomeEnum::Success, $crawl->fresh()->outcome);
|
|
||||||
}
|
|
||||||
|
|
||||||
private function mockFetchPageAction(
|
|
||||||
CrawlOutcomeEnum $outcome,
|
|
||||||
?int $statusCode = null,
|
|
||||||
?string $finalUrl = 'https://example.com/article',
|
|
||||||
?string $title = null,
|
|
||||||
?string $extractedText = null,
|
|
||||||
?Collection $outboundLinks = null,
|
|
||||||
?int $wordCount = null,
|
|
||||||
?string $errorMessage = null,
|
|
||||||
): void {
|
|
||||||
$fetcher = Mockery::mock(FetchPageAction::class);
|
|
||||||
$fetcher->shouldReceive('__invoke')->andReturn(new FetchResult(
|
|
||||||
outcome: $outcome,
|
|
||||||
statusCode: $statusCode,
|
|
||||||
finalUrl: $finalUrl,
|
|
||||||
title: $title,
|
|
||||||
extractedText: $extractedText,
|
|
||||||
outboundLinks: $outboundLinks ?? collect(),
|
|
||||||
wordCount: $wordCount,
|
|
||||||
errorMessage: $errorMessage,
|
|
||||||
));
|
|
||||||
$this->app->instance(FetchPageAction::class, $fetcher);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,70 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace Tests\Feature;
|
|
||||||
|
|
||||||
use App\Models\Page;
|
|
||||||
use App\Models\PageCrawl;
|
|
||||||
use App\Services\UrlService;
|
|
||||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
|
||||||
use Tests\TestCase;
|
|
||||||
|
|
||||||
class PageQueuePopulationTest extends TestCase
|
|
||||||
{
|
|
||||||
use RefreshDatabase;
|
|
||||||
|
|
||||||
public function test_creating_a_page_inserts_a_page_crawl_row(): void
|
|
||||||
{
|
|
||||||
$url = 'https://example-blog.com/article';
|
|
||||||
|
|
||||||
$page = Page::factory()->create(['url' => $url]);
|
|
||||||
|
|
||||||
$expectedDomain = (new UrlService)->host($url);
|
|
||||||
|
|
||||||
$this->assertDatabaseHas('page_crawls', [
|
|
||||||
'page_id' => $page->id,
|
|
||||||
'domain' => $expectedDomain,
|
|
||||||
'priority' => 0,
|
|
||||||
]);
|
|
||||||
|
|
||||||
$crawl = PageCrawl::where('page_id', $page->id)->first();
|
|
||||||
$this->assertNotNull($crawl);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_first_or_create_with_existing_url_does_not_insert_duplicate_crawl(): void
|
|
||||||
{
|
|
||||||
$url = 'https://example-blog.com/article';
|
|
||||||
|
|
||||||
Page::factory()->create(['url' => $url]);
|
|
||||||
|
|
||||||
// Finds the existing row — created event does not fire again
|
|
||||||
Page::firstOrCreate(['url' => $url], ['status' => 'discovered']);
|
|
||||||
|
|
||||||
$this->assertDatabaseCount('page_crawls', 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_updating_a_page_does_not_insert_another_crawl(): void
|
|
||||||
{
|
|
||||||
$page = Page::factory()->create(['url' => 'https://example-blog.com/article']);
|
|
||||||
|
|
||||||
$page->update(['title' => 'New Title']);
|
|
||||||
|
|
||||||
$this->assertDatabaseCount('page_crawls', 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_bad_url_throws_exception_page_persists_no_crawl_inserted(): void
|
|
||||||
{
|
|
||||||
$caught = null;
|
|
||||||
|
|
||||||
try {
|
|
||||||
Page::create(['url' => 'not-a-url', 'status' => 'discovered']);
|
|
||||||
} catch (\InvalidArgumentException $e) {
|
|
||||||
$caught = $e;
|
|
||||||
}
|
|
||||||
|
|
||||||
$this->assertNotNull($caught, 'Expected InvalidArgumentException to be thrown');
|
|
||||||
$this->assertDatabaseHas('pages', ['url' => 'not-a-url']);
|
|
||||||
$this->assertDatabaseCount('page_crawls', 0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -4,6 +4,7 @@
|
||||||
|
|
||||||
namespace Tests\Feature;
|
namespace Tests\Feature;
|
||||||
|
|
||||||
|
use App\Enums\PageStatusEnum;
|
||||||
use App\Listeners\UrlDiscoveredListener;
|
use App\Listeners\UrlDiscoveredListener;
|
||||||
use App\Models\Page;
|
use App\Models\Page;
|
||||||
use App\Models\PageLink;
|
use App\Models\PageLink;
|
||||||
|
|
@ -65,10 +66,15 @@ public function test_listener_creates_target_page_and_source_page_with_link(): v
|
||||||
// Target page
|
// Target page
|
||||||
$targetPage = Page::where('url', 'https://example-blog.com/article')->first();
|
$targetPage = Page::where('url', 'https://example-blog.com/article')->first();
|
||||||
$this->assertNotNull($targetPage);
|
$this->assertNotNull($targetPage);
|
||||||
|
$this->assertSame(PageStatusEnum::Discovered, $targetPage->status);
|
||||||
|
$this->assertSame($instance->id, $targetPage->instance_id);
|
||||||
|
|
||||||
// Source page
|
// Source page
|
||||||
$sourcePage = Page::where('url', 'https://mastodon.social/@alice/109876543210')->first();
|
$sourcePage = Page::where('url', 'https://mastodon.social/@alice/109876543210')->first();
|
||||||
$this->assertNotNull($sourcePage);
|
$this->assertNotNull($sourcePage);
|
||||||
|
$this->assertSame(PageStatusEnum::Discovered, $sourcePage->status);
|
||||||
|
$this->assertSame($instance->id, $sourcePage->instance_id);
|
||||||
|
$this->assertNull($sourcePage->fetched_at);
|
||||||
|
|
||||||
// Edge
|
// Edge
|
||||||
$link = PageLink::where('source_page_id', $sourcePage->id)
|
$link = PageLink::where('source_page_id', $sourcePage->id)
|
||||||
|
|
@ -109,30 +115,8 @@ public function test_listener_with_null_post_url_creates_only_target_page(): voi
|
||||||
|
|
||||||
$targetPage = Page::where('url', 'https://example-blog.com/article')->first();
|
$targetPage = Page::where('url', 'https://example-blog.com/article')->first();
|
||||||
$this->assertNotNull($targetPage);
|
$this->assertNotNull($targetPage);
|
||||||
}
|
$this->assertSame(PageStatusEnum::Discovered, $targetPage->status);
|
||||||
|
$this->assertSame($instance->id, $targetPage->instance_id);
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Integration — UrlDiscovered event enqueues crawls for both pages via observer
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
public function test_url_discovered_event_enqueues_crawls_via_observer(): void
|
|
||||||
{
|
|
||||||
$instance = $this->makeInstance();
|
|
||||||
|
|
||||||
$event = new UrlDiscovered(
|
|
||||||
url: 'https://example-blog.com/article',
|
|
||||||
instanceId: $instance->id,
|
|
||||||
discoveredAt: CarbonImmutable::parse('2026-04-26T12:00:00Z'),
|
|
||||||
postUrl: 'https://mastodon.social/@alice/109876543210',
|
|
||||||
postBody: 'check this out https://example-blog.com/article',
|
|
||||||
);
|
|
||||||
|
|
||||||
event($event);
|
|
||||||
|
|
||||||
// Listener creates 2 pages (target + source); observer fires for each → 2 crawl rows
|
|
||||||
$this->assertDatabaseCount('page_crawls', 2);
|
|
||||||
$this->assertDatabaseHas('page_crawls', ['domain' => 'example-blog.com']);
|
|
||||||
$this->assertDatabaseHas('page_crawls', ['domain' => 'mastodon.social']);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -1,158 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace Tests\Feature;
|
|
||||||
|
|
||||||
use App\Enums\PageStatusEnum;
|
|
||||||
use App\Livewire\UrlSubmissionForm;
|
|
||||||
use App\Models\Page;
|
|
||||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
|
||||||
use Livewire\Livewire;
|
|
||||||
use PHPUnit\Framework\Attributes\DataProvider;
|
|
||||||
use Tests\TestCase;
|
|
||||||
|
|
||||||
class UrlSubmissionTest extends TestCase
|
|
||||||
{
|
|
||||||
use RefreshDatabase;
|
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
// Test 1 — route renders the submission form
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
|
|
||||||
public function test_submission_form_renders_at_public_route(): void
|
|
||||||
{
|
|
||||||
$response = $this->get('/submit');
|
|
||||||
|
|
||||||
$response->assertStatus(200);
|
|
||||||
$response->assertSeeLivewire('url-submission-form');
|
|
||||||
}
|
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
// Test 2 — valid submission creates a page row as Discovered
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
|
|
||||||
public function test_valid_url_submission_creates_page_as_discovered(): void
|
|
||||||
{
|
|
||||||
Livewire::test(UrlSubmissionForm::class)
|
|
||||||
->set('url', 'https://example.com/interesting-post')
|
|
||||||
->call('submit')
|
|
||||||
->assertHasNoErrors();
|
|
||||||
|
|
||||||
$this->assertDatabaseHas('pages', [
|
|
||||||
'url' => 'https://example.com/interesting-post',
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
// Test 3 — duplicate submission is idempotent (no second row created)
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
|
|
||||||
public function test_duplicate_url_submission_does_not_create_second_page(): void
|
|
||||||
{
|
|
||||||
$url = 'https://example.com/seen-before';
|
|
||||||
|
|
||||||
Page::factory()->create([
|
|
||||||
'url' => $url,
|
|
||||||
'status' => PageStatusEnum::Discovered,
|
|
||||||
]);
|
|
||||||
|
|
||||||
Livewire::test(UrlSubmissionForm::class)
|
|
||||||
->set('url', $url)
|
|
||||||
->call('submit')
|
|
||||||
->assertHasNoErrors();
|
|
||||||
|
|
||||||
$this->assertDatabaseCount('pages', 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
// Test 4 — confirmation state echoes submitted URL
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
|
|
||||||
public function test_confirmation_state_echoes_submitted_url(): void
|
|
||||||
{
|
|
||||||
$url = 'https://example.com/great-article';
|
|
||||||
|
|
||||||
Livewire::test(UrlSubmissionForm::class)
|
|
||||||
->set('url', $url)
|
|
||||||
->call('submit')
|
|
||||||
->assertHasNoErrors()
|
|
||||||
->assertSet('confirmedUrl', $url)
|
|
||||||
->assertSet('url', '')
|
|
||||||
->assertSee($url);
|
|
||||||
}
|
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
// Test 5 — empty URL fails validation (regression lock)
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
|
|
||||||
public function test_missing_url_fails_validation(): void
|
|
||||||
{
|
|
||||||
Livewire::test(UrlSubmissionForm::class)
|
|
||||||
->set('url', '')
|
|
||||||
->call('submit')
|
|
||||||
->assertHasErrors(['url' => 'required']);
|
|
||||||
}
|
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
// Test 6 — invalid URL formats fail validation
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
|
|
||||||
#[DataProvider('invalidUrls')]
|
|
||||||
public function test_invalid_url_formats_fail_validation(string $url): void
|
|
||||||
{
|
|
||||||
Livewire::test(UrlSubmissionForm::class)
|
|
||||||
->set('url', $url)
|
|
||||||
->call('submit')
|
|
||||||
->assertHasErrors('url');
|
|
||||||
}
|
|
||||||
|
|
||||||
public static function invalidUrls(): array
|
|
||||||
{
|
|
||||||
return [
|
|
||||||
'no scheme' => ['not-a-url'],
|
|
||||||
'disallowed scheme' => ['ftp://example.com'],
|
|
||||||
'javascript scheme' => ['javascript:alert(1)'],
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
// Integration — form submission enqueues a crawl via PageObserver
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
|
|
||||||
public function test_url_submission_form_enqueues_crawl_via_observer(): void
|
|
||||||
{
|
|
||||||
Livewire::test(UrlSubmissionForm::class)
|
|
||||||
->set('url', 'https://example.com/article')
|
|
||||||
->call('submit')
|
|
||||||
->assertHasNoErrors();
|
|
||||||
|
|
||||||
$this->assertDatabaseCount('page_crawls', 1);
|
|
||||||
$this->assertDatabaseHas('page_crawls', ['domain' => 'example.com']);
|
|
||||||
}
|
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
// Test 7 — rate limit blocks the 11th submission within a minute
|
|
||||||
// -------------------------------------------------------------------------
|
|
||||||
|
|
||||||
public function test_rate_limit_blocks_eleventh_submission_within_a_minute(): void
|
|
||||||
{
|
|
||||||
// 10 submissions within the limit — each must succeed
|
|
||||||
for ($i = 1; $i <= 10; $i++) {
|
|
||||||
Livewire::test(UrlSubmissionForm::class)
|
|
||||||
->set('url', "https://example.com/post-{$i}")
|
|
||||||
->call('submit')
|
|
||||||
->assertHasNoErrors();
|
|
||||||
}
|
|
||||||
|
|
||||||
// 11th submission from the same IP must be blocked, with the message visible
|
|
||||||
Livewire::test(UrlSubmissionForm::class)
|
|
||||||
->set('url', 'https://example.com/post-11')
|
|
||||||
->call('submit')
|
|
||||||
->assertHasErrors('rate_limit')
|
|
||||||
->assertSee('Too many submissions');
|
|
||||||
|
|
||||||
// The 11th URL must NOT have been persisted
|
|
||||||
$this->assertDatabaseCount('pages', 10);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,83 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace Tests\Unit\Actions;
|
|
||||||
|
|
||||||
use App\Actions\RegisterDiscoveredPageAction;
|
|
||||||
use App\Enums\PageStatusEnum;
|
|
||||||
use App\Models\Page;
|
|
||||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
|
||||||
use Lvl0\FediDiscover\Config\InstanceType;
|
|
||||||
use Lvl0\FediDiscover\Models\Instance;
|
|
||||||
use Tests\TestCase;
|
|
||||||
|
|
||||||
class RegisterDiscoveredPageActionTest extends TestCase
|
|
||||||
{
|
|
||||||
use RefreshDatabase;
|
|
||||||
|
|
||||||
public function test_creates_page_with_url_and_discovered_status(): void
|
|
||||||
{
|
|
||||||
$action = new RegisterDiscoveredPageAction;
|
|
||||||
|
|
||||||
$page = $action('https://example.com/article');
|
|
||||||
|
|
||||||
$this->assertInstanceOf(Page::class, $page);
|
|
||||||
$this->assertSame('https://example.com/article', $page->url);
|
|
||||||
$this->assertSame(PageStatusEnum::Discovered, $page->status);
|
|
||||||
$this->assertNull($page->instance_id);
|
|
||||||
$this->assertDatabaseHas('pages', ['url' => 'https://example.com/article']);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_creates_page_with_provided_instance_id(): void
|
|
||||||
{
|
|
||||||
$instance = Instance::factory()
|
|
||||||
->type(InstanceType::Mastodon)
|
|
||||||
->enabled()
|
|
||||||
->create();
|
|
||||||
|
|
||||||
$action = new RegisterDiscoveredPageAction;
|
|
||||||
|
|
||||||
$page = $action('https://example.com/fediverse-post', instanceId: $instance->id);
|
|
||||||
|
|
||||||
$this->assertInstanceOf(Page::class, $page);
|
|
||||||
$this->assertSame($instance->id, $page->instance_id);
|
|
||||||
$this->assertDatabaseHas('pages', [
|
|
||||||
'url' => 'https://example.com/fediverse-post',
|
|
||||||
'instance_id' => $instance->id,
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_returns_existing_page_when_url_already_exists(): void
|
|
||||||
{
|
|
||||||
$existing = Page::factory()->createQuietly([
|
|
||||||
'url' => 'https://example.com/seen-before',
|
|
||||||
'status' => PageStatusEnum::Discovered,
|
|
||||||
]);
|
|
||||||
|
|
||||||
$action = new RegisterDiscoveredPageAction;
|
|
||||||
|
|
||||||
$returned = $action('https://example.com/seen-before');
|
|
||||||
|
|
||||||
$this->assertSame($existing->id, $returned->id);
|
|
||||||
$this->assertDatabaseCount('pages', 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_existing_page_status_not_overwritten_on_duplicate_call(): void
|
|
||||||
{
|
|
||||||
Page::factory()->createQuietly([
|
|
||||||
'url' => 'https://example.com/already-fetched',
|
|
||||||
'status' => PageStatusEnum::Fetched,
|
|
||||||
]);
|
|
||||||
|
|
||||||
$action = new RegisterDiscoveredPageAction;
|
|
||||||
|
|
||||||
$returned = $action('https://example.com/already-fetched');
|
|
||||||
|
|
||||||
$this->assertSame(PageStatusEnum::Fetched, $returned->status);
|
|
||||||
$this->assertDatabaseHas('pages', [
|
|
||||||
'url' => 'https://example.com/already-fetched',
|
|
||||||
'status' => PageStatusEnum::Fetched,
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,75 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace Tests\Unit\Enums;
|
|
||||||
|
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
|
||||||
use App\Enums\PageStatusEnum;
|
|
||||||
use Tests\TestCase;
|
|
||||||
|
|
||||||
class CrawlOutcomeEnumTest extends TestCase
|
|
||||||
{
|
|
||||||
public function test_all_expected_cases_exist_with_correct_backing_values(): void
|
|
||||||
{
|
|
||||||
$expected = [
|
|
||||||
'Success' => 'success',
|
|
||||||
'Failed' => 'failed',
|
|
||||||
'Timeout' => 'timeout',
|
|
||||||
'BlockedRobots' => 'blocked_robots',
|
|
||||||
'Blocked4xx' => 'blocked_4xx',
|
|
||||||
'Blocked5xx' => 'blocked_5xx',
|
|
||||||
'Rejected' => 'rejected',
|
|
||||||
];
|
|
||||||
|
|
||||||
foreach ($expected as $caseName => $backingValue) {
|
|
||||||
$case = CrawlOutcomeEnum::from($backingValue);
|
|
||||||
|
|
||||||
$this->assertSame($caseName, $case->name, "Case name for '{$backingValue}' should be '{$caseName}'");
|
|
||||||
$this->assertSame($backingValue, $case->value, "Backing value for '{$caseName}' should be '{$backingValue}'");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_enum_has_exactly_seven_cases(): void
|
|
||||||
{
|
|
||||||
$this->assertCount(7, CrawlOutcomeEnum::cases());
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_to_page_status_maps_each_outcome_correctly(): void
|
|
||||||
{
|
|
||||||
$this->assertSame(PageStatusEnum::Fetched, CrawlOutcomeEnum::Success->toPageStatus());
|
|
||||||
$this->assertSame(PageStatusEnum::Rejected, CrawlOutcomeEnum::Rejected->toPageStatus());
|
|
||||||
$this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Failed->toPageStatus());
|
|
||||||
$this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Timeout->toPageStatus());
|
|
||||||
$this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Blocked4xx->toPageStatus());
|
|
||||||
$this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::Blocked5xx->toPageStatus());
|
|
||||||
$this->assertSame(PageStatusEnum::Failed, CrawlOutcomeEnum::BlockedRobots->toPageStatus());
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_is_retryable_returns_true_only_for_transient_failures(): void
|
|
||||||
{
|
|
||||||
// Retryable: transient network/server problems that may resolve later
|
|
||||||
$this->assertTrue(CrawlOutcomeEnum::Failed->isRetryable());
|
|
||||||
$this->assertTrue(CrawlOutcomeEnum::Timeout->isRetryable());
|
|
||||||
$this->assertTrue(CrawlOutcomeEnum::Blocked5xx->isRetryable());
|
|
||||||
|
|
||||||
// Not retryable: success (done), permanent failures, or policy decisions
|
|
||||||
$this->assertFalse(CrawlOutcomeEnum::Success->isRetryable());
|
|
||||||
$this->assertFalse(CrawlOutcomeEnum::Rejected->isRetryable());
|
|
||||||
$this->assertFalse(CrawlOutcomeEnum::BlockedRobots->isRetryable());
|
|
||||||
$this->assertFalse(CrawlOutcomeEnum::Blocked4xx->isRetryable());
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_should_register_outbound_links_returns_true_only_for_success(): void
|
|
||||||
{
|
|
||||||
$this->assertTrue(CrawlOutcomeEnum::Success->shouldRegisterOutboundLinks());
|
|
||||||
|
|
||||||
// No links to register on any non-Success outcome
|
|
||||||
$this->assertFalse(CrawlOutcomeEnum::Failed->shouldRegisterOutboundLinks());
|
|
||||||
$this->assertFalse(CrawlOutcomeEnum::Timeout->shouldRegisterOutboundLinks());
|
|
||||||
$this->assertFalse(CrawlOutcomeEnum::Rejected->shouldRegisterOutboundLinks());
|
|
||||||
$this->assertFalse(CrawlOutcomeEnum::BlockedRobots->shouldRegisterOutboundLinks());
|
|
||||||
$this->assertFalse(CrawlOutcomeEnum::Blocked4xx->shouldRegisterOutboundLinks());
|
|
||||||
$this->assertFalse(CrawlOutcomeEnum::Blocked5xx->shouldRegisterOutboundLinks());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,33 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace Tests\Unit\Enums;
|
|
||||||
|
|
||||||
use App\Enums\PageStatusEnum;
|
|
||||||
use Tests\TestCase;
|
|
||||||
|
|
||||||
class PageStatusEnumTest extends TestCase
|
|
||||||
{
|
|
||||||
public function test_all_expected_cases_exist_with_correct_backing_values(): void
|
|
||||||
{
|
|
||||||
$expected = [
|
|
||||||
'Discovered' => 'discovered',
|
|
||||||
'Fetched' => 'fetched',
|
|
||||||
'Failed' => 'failed',
|
|
||||||
'Rejected' => 'rejected',
|
|
||||||
];
|
|
||||||
|
|
||||||
foreach ($expected as $caseName => $backingValue) {
|
|
||||||
$case = PageStatusEnum::from($backingValue);
|
|
||||||
|
|
||||||
$this->assertSame($caseName, $case->name, "Case name for '{$backingValue}' should be '{$caseName}'");
|
|
||||||
$this->assertSame($backingValue, $case->value, "Backing value for '{$caseName}' should be '{$backingValue}'");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_enum_has_exactly_four_cases(): void
|
|
||||||
{
|
|
||||||
$this->assertCount(4, PageStatusEnum::cases());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,42 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace Tests\Unit\Models;
|
|
||||||
|
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
|
||||||
use App\Models\Page;
|
|
||||||
use App\Models\PageCrawl;
|
|
||||||
use Carbon\Carbon;
|
|
||||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
|
||||||
use Illuminate\Support\Facades\Queue;
|
|
||||||
use Tests\TestCase;
|
|
||||||
|
|
||||||
class PageCrawlFactoryTest extends TestCase
|
|
||||||
{
|
|
||||||
use RefreshDatabase;
|
|
||||||
|
|
||||||
public function test_factory_successful_state_produces_success_outcome(): void
|
|
||||||
{
|
|
||||||
Queue::fake();
|
|
||||||
|
|
||||||
$page = Page::factory()->create();
|
|
||||||
$crawl = PageCrawl::factory()->page($page)->successful()->create();
|
|
||||||
|
|
||||||
$this->assertSame(CrawlOutcomeEnum::Success, $crawl->outcome);
|
|
||||||
$this->assertInstanceOf(Carbon::class, $crawl->completed_at);
|
|
||||||
$this->assertNull($crawl->error_message);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_factory_failed_state_produces_failed_outcome_with_message(): void
|
|
||||||
{
|
|
||||||
Queue::fake();
|
|
||||||
|
|
||||||
$page = Page::factory()->create();
|
|
||||||
$crawl = PageCrawl::factory()->page($page)->failed('Connection timed out')->create();
|
|
||||||
|
|
||||||
$this->assertSame(CrawlOutcomeEnum::Failed, $crawl->outcome);
|
|
||||||
$this->assertInstanceOf(Carbon::class, $crawl->completed_at);
|
|
||||||
$this->assertSame('Connection timed out', $crawl->error_message);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,111 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace Tests\Unit\Models;
|
|
||||||
|
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
|
||||||
use App\Models\Page;
|
|
||||||
use App\Models\PageCrawl;
|
|
||||||
use Carbon\Carbon;
|
|
||||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
|
||||||
use Illuminate\Support\Facades\Queue;
|
|
||||||
use Tests\TestCase;
|
|
||||||
|
|
||||||
class PageCrawlTest extends TestCase
|
|
||||||
{
|
|
||||||
use RefreshDatabase;
|
|
||||||
|
|
||||||
public function test_page_crawl_fillable_fields_persist_and_casts_are_applied(): void
|
|
||||||
{
|
|
||||||
Queue::fake();
|
|
||||||
|
|
||||||
$page = Page::factory()->createQuietly(['url' => 'https://example.com/page-1']);
|
|
||||||
|
|
||||||
$completedAt = Carbon::parse('2026-05-01 10:01:05');
|
|
||||||
|
|
||||||
$crawl = PageCrawl::create([
|
|
||||||
'page_id' => $page->id,
|
|
||||||
'domain' => 'example.com',
|
|
||||||
'priority' => 5,
|
|
||||||
'completed_at' => $completedAt,
|
|
||||||
'outcome' => CrawlOutcomeEnum::Success,
|
|
||||||
'status_code' => 200,
|
|
||||||
'error_message' => null,
|
|
||||||
]);
|
|
||||||
|
|
||||||
$fresh = $crawl->fresh();
|
|
||||||
|
|
||||||
$this->assertNotNull($fresh);
|
|
||||||
|
|
||||||
// domain / priority round-trip
|
|
||||||
$this->assertSame('example.com', $fresh->domain);
|
|
||||||
$this->assertSame(5, $fresh->priority);
|
|
||||||
|
|
||||||
// outcome is cast to the enum
|
|
||||||
$this->assertInstanceOf(CrawlOutcomeEnum::class, $fresh->outcome);
|
|
||||||
$this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome);
|
|
||||||
|
|
||||||
// datetime casts
|
|
||||||
$this->assertInstanceOf(Carbon::class, $fresh->completed_at);
|
|
||||||
|
|
||||||
$this->assertTrue($completedAt->equalTo($fresh->completed_at));
|
|
||||||
|
|
||||||
// nullable columns
|
|
||||||
$this->assertNull($fresh->error_message);
|
|
||||||
|
|
||||||
// status_code persists
|
|
||||||
$this->assertSame(200, $fresh->status_code);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_page_crawl_belongs_to_a_page(): void
|
|
||||||
{
|
|
||||||
$page = Page::factory()->createQuietly(['url' => 'https://example.com/page-2']);
|
|
||||||
|
|
||||||
$crawl = PageCrawl::create([
|
|
||||||
'page_id' => $page->id,
|
|
||||||
'domain' => 'example.com',
|
|
||||||
'priority' => 1,
|
|
||||||
]);
|
|
||||||
|
|
||||||
$related = $crawl->page;
|
|
||||||
|
|
||||||
$this->assertInstanceOf(Page::class, $related);
|
|
||||||
$this->assertSame($page->id, $related->id);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function test_deleting_a_page_cascades_to_its_page_crawls(): void
|
|
||||||
{
|
|
||||||
// createQuietly() skips the PageObserver so the count of explicit rows is predictable;
|
|
||||||
// this test is about cascade delete behaviour, not observer side effects.
|
|
||||||
$page = Page::factory()->createQuietly(['url' => 'https://example.com/page-cascade']);
|
|
||||||
|
|
||||||
PageCrawl::factory()->page($page)->create();
|
|
||||||
PageCrawl::factory()->page($page)->successful()->create();
|
|
||||||
PageCrawl::factory()->page($page)->failed('timeout during fetch')->create();
|
|
||||||
|
|
||||||
$this->assertSame(3, PageCrawl::count());
|
|
||||||
|
|
||||||
$page->delete();
|
|
||||||
|
|
||||||
$this->assertSame(0, PageCrawl::count());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * A crawl with a null `outcome` is the only one matched by whereNull('outcome');
 * crawls that already have an outcome are matched by whereNotNull('outcome').
 */
public function test_pending_crawls_are_filtered_by_null_outcome(): void
{
    Queue::fake();

    // Created quietly to bypass the PageObserver: the crawl it would insert has
    // outcome = null and would throw off the counts asserted below.
    $owner = Page::factory()->createQuietly(['url' => 'https://example.com/page-pending']);

    $pendingCrawl = PageCrawl::factory()->page($owner)->create();
    PageCrawl::factory()->page($owner)->successful()->create();
    PageCrawl::factory()->page($owner)->failed('connection refused')->create();

    $pendingCount = PageCrawl::whereNull('outcome')->count();
    $this->assertSame(1, $pendingCount);

    $firstPending = PageCrawl::whereNull('outcome')->first();
    $this->assertSame($pendingCrawl->id, $firstPending->id);

    $finishedCount = PageCrawl::whereNotNull('outcome')->count();
    $this->assertSame(2, $finishedCount);
}
|
|
||||||
}
|
|
||||||
|
|
@ -6,11 +6,8 @@
|
||||||
|
|
||||||
use App\Enums\PageStatusEnum;
|
use App\Enums\PageStatusEnum;
|
||||||
use App\Models\Page;
|
use App\Models\Page;
|
||||||
use App\Models\PageCrawl;
|
|
||||||
use App\Models\PageLink;
|
use App\Models\PageLink;
|
||||||
use Carbon\Carbon;
|
|
||||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
use Illuminate\Foundation\Testing\RefreshDatabase;
|
||||||
use Illuminate\Support\Facades\Queue;
|
|
||||||
use Lvl0\FediDiscover\Config\InstanceType;
|
use Lvl0\FediDiscover\Config\InstanceType;
|
||||||
use Lvl0\FediDiscover\Models\Instance;
|
use Lvl0\FediDiscover\Models\Instance;
|
||||||
use Tests\TestCase;
|
use Tests\TestCase;
|
||||||
|
|
@ -19,12 +16,6 @@ class PageTest extends TestCase
|
||||||
{
|
{
|
||||||
use RefreshDatabase;
|
use RefreshDatabase;
|
||||||
|
|
||||||
/**
 * Per-test bootstrap: runs the parent set-up first, then replaces the queue
 * with a fake for the duration of the test.
 */
protected function setUp(): void
{
    parent::setUp();

    // NOTE(review): presumably here so model events fired by these tests never
    // push real jobs onto the queue — confirm against the Page observer.
    Queue::fake();
}
|
|
||||||
|
|
||||||
public function test_page_model_fillable_fields_can_be_mass_assigned(): void
|
public function test_page_model_fillable_fields_can_be_mass_assigned(): void
|
||||||
{
|
{
|
||||||
$page = Page::create([
|
$page = Page::create([
|
||||||
|
|
@ -85,73 +76,6 @@ public function test_page_outgoing_and_incoming_links_relationships(): void
|
||||||
$this->assertSame($target->id, $freshSource->outgoingLinks->first()->target_page_id);
|
$this->assertSame($target->id, $freshSource->outgoingLinks->first()->target_page_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * `language` can be mass assigned and survives a reload; when it is not
 * supplied, the reloaded page reports null.
 */
public function test_page_language_is_fillable_and_persists(): void
{
    // Case 1: language supplied at creation time.
    $withLanguage = Page::create([
        'url' => 'https://example.com/crawled',
        'status' => 'discovered',
        'language' => 'en',
    ]);

    $reloaded = $withLanguage->fresh();

    $this->assertNotNull($reloaded);
    $this->assertSame('en', $reloaded->language);

    // Case 2: language omitted entirely.
    $withoutLanguage = Page::create([
        'url' => 'https://example.com/no-language',
        'status' => 'discovered',
    ]);

    $reloadedWithoutLanguage = $withoutLanguage->fresh();

    $this->assertNull($reloadedWithoutLanguage->language);
}
|
|
||||||
|
|
||||||
/**
 * `crawls` returns only the PageCrawl rows that belong to the page it is
 * called on, not rows belonging to another page.
 */
public function test_page_has_many_crawls(): void
{
    // Both pages are created quietly so the PageObserver inserts no automatic
    // crawl row; the test targets HasMany scoping, not observer side effects.
    $subject = Page::factory()->createQuietly();
    $unrelated = Page::factory()->createQuietly();

    // Three crawls for the page under test...
    for ($i = 0; $i < 3; $i++) {
        PageCrawl::create(['page_id' => $subject->id, 'domain' => 'example.com']);
    }

    // ...and one for a different page, which must not leak into the relation.
    PageCrawl::create(['page_id' => $unrelated->id, 'domain' => 'other.com']);

    $related = $subject->fresh()->crawls;

    $this->assertCount(3, $related);

    foreach ($related as $row) {
        $this->assertInstanceOf(PageCrawl::class, $row);
        $this->assertSame($subject->id, $row->page_id);
    }
}
|
|
||||||
|
|
||||||
/**
 * `latestCrawl` resolves to the crawl with the most recent `created_at`.
 */
public function test_page_latest_crawl_returns_row_with_latest_created_at(): void
{
    // Created quietly to bypass the PageObserver. With create() the observer would
    // add a crawl stamped now(), and the test would become fragile once the
    // hardcoded sentinel date below is in the past. Only latestOfMany ordering
    // is under test here.
    $subject = Page::factory()->createQuietly();

    // created_at => extra attributes, oldest first. Only the newest row carries
    // the sentinel error message used to identify it afterwards.
    $timeline = [
        '2026-01-01 08:00:00' => [],
        '2026-03-15 12:00:00' => [],
        '2026-05-10 18:00:00' => ['error_message' => 'sentinel-latest'],
    ];

    foreach ($timeline as $createdAt => $extraAttributes) {
        $row = PageCrawl::create(['page_id' => $subject->id, 'domain' => 'example.com'] + $extraAttributes);

        // Backdate the row after insertion, then persist the new timestamp.
        $row->created_at = Carbon::parse($createdAt);
        $row->save();
    }

    $resolved = $subject->fresh()->latestCrawl;

    $this->assertInstanceOf(PageCrawl::class, $resolved);
    $this->assertSame('sentinel-latest', $resolved->error_message);
}
|
|
||||||
|
|
||||||
public function test_page_status_is_cast_to_enum(): void
|
public function test_page_status_is_cast_to_enum(): void
|
||||||
{
|
{
|
||||||
$cases = [
|
$cases = [
|
||||||
|
|
|
||||||
|
|
@ -1,23 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace Tests\Unit\Services;
|
|
||||||
|
|
||||||
use App\Services\PolitenessService;
|
|
||||||
use Tests\TestCase;
|
|
||||||
|
|
||||||
/**
 * Tests for PolitenessService::minDelayFor() and the
 * `crawler.min_domain_delay_seconds` config value it is checked against.
 */
class PolitenessServiceTest extends TestCase
{
    /**
     * With no override in place, the delay reported for a domain is 10.
     */
    public function test_min_delay_for_returns_config_default(): void
    {
        $politeness = new PolitenessService();

        $delay = $politeness->minDelayFor('example.com');

        $this->assertSame(10, $delay);
    }

    /**
     * Setting the config key before the service is built changes the reported delay.
     */
    public function test_min_delay_for_respects_config_override(): void
    {
        config()->set('crawler.min_domain_delay_seconds', 30);

        $politeness = new PolitenessService();
        $delay = $politeness->minDelayFor('example.com');

        $this->assertSame(30, $delay);
    }
}
|
|
||||||
|
|
@ -1,111 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace Tests\Unit\Services;
|
|
||||||
|
|
||||||
use App\Services\UrlService;
|
|
||||||
use PHPUnit\Framework\Attributes\DataProvider;
|
|
||||||
use Tests\TestCase;
|
|
||||||
|
|
||||||
/**
 * Tests for UrlService::host(): extracting the host from well-formed URLs and
 * throwing InvalidArgumentException for inputs the service rejects.
 */
class UrlServiceTest extends TestCase
{
    private UrlService $urlService;

    /**
     * Builds a fresh UrlService for every test.
     */
    protected function setUp(): void
    {
        parent::setUp();

        $this->urlService = new UrlService();
    }

    // ---- Happy path: simple URL ---------------------------------------------

    public function test_extracts_host_from_simple_url(): void
    {
        $host = $this->urlService->host('https://example.com');

        $this->assertSame('example.com', $host);
    }

    // ---- Path, query string, and fragment are ignored -----------------------

    #[DataProvider('urlsWithNoise')]
    public function test_extracts_host_ignoring_path_query_and_fragment(string $url, string $expectedHost): void
    {
        $actualHost = $this->urlService->host($url);

        $this->assertSame($expectedHost, $actualHost);
    }

    /**
     * Named datasets of [url, expected host] for URLs that carry a path,
     * query string and/or fragment after the host.
     */
    public static function urlsWithNoise(): array
    {
        $exampleHost = 'example.com';

        return [
            'path only' => ['https://example.com/some/path', $exampleHost],
            'path and query' => ['https://example.com/page?q=hello&lang=en', $exampleHost],
            'path, query, fragment' => ['https://example.com/page?q=1#section', $exampleHost],
            'http scheme with path' => ['http://news.ycombinator.com/item?id=42', 'news.ycombinator.com'],
        ];
    }

    // ---- Port number is stripped from the host ------------------------------

    public function test_strips_port_from_host(): void
    {
        $host = $this->urlService->host('https://example.com:8080/path');

        $this->assertSame('example.com', $host);
    }

    // ---- Host is always returned as lowercase -------------------------------

    public function test_lowercases_host(): void
    {
        $host = $this->urlService->host('https://EXAMPLE.COM/path');

        $this->assertSame('example.com', $host);
    }

    // ---- Throws on malformed, disallowed, or IP-literal input ---------------

    #[DataProvider('invalidInputs')]
    public function test_throws_on_invalid_input(string $url): void
    {
        // The expectation must be registered before the call that throws.
        $this->expectException(\InvalidArgumentException::class);

        $this->urlService->host($url);
    }

    /**
     * Named datasets of single-URL inputs that host() must reject, grouped by
     * the reason for rejection and merged in declaration order.
     */
    public static function invalidInputs(): array
    {
        // Malformed input or missing URL structure.
        $malformed = [
            'empty string' => [''],
            'no scheme' => ['example.com/path'],
            'scheme only' => ['https://'],
            'bare string' => ['not a url at all'],
        ];

        // Schemes that are not allowed.
        $disallowedSchemes = [
            'javascript scheme' => ['javascript:alert(1)'],
            'ftp scheme' => ['ftp://example.com'],
            'data scheme' => ['data:text/html,<h1>hi</h1>'],
        ];

        // IP literals are not valid page-URL hosts for Trove's purposes.
        $ipLiterals = [
            'ipv4 literal' => ['https://192.168.1.1/path'],
            'ipv6 literal' => ['https://[::1]/path'],
            'ipv4 without path' => ['http://10.0.0.1'],
        ];

        // Embedded credentials (userinfo) are a phishing/SSRF flag.
        $withUserinfo = [
            'embedded credentials' => ['https://user:pass@example.com/'],
            'username only' => ['https://user@example.com/'],
        ];

        $ipv6EdgeCases = [
            // Zone identifier: the zone suffix defeats FILTER_VALIDATE_IP.
            'ipv6 with zone' => ['https://[fe80::1%25eth0]/'],
            // IPv4-mapped IPv6: FILTER_VALIDATE_IP recognises ::ffff:x.x.x.x as valid IPv6.
            'ipv4 mapped ipv6' => ['https://[::ffff:192.0.2.1]/path'],
        ];

        return array_merge($malformed, $disallowedSchemes, $ipLiterals, $withUserinfo, $ipv6EdgeCases);
    }
}
|
|
||||||
|
|
@ -1,60 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
declare(strict_types=1);
|
|
||||||
|
|
||||||
namespace Tests\Unit\ValueObjects;
|
|
||||||
|
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
|
||||||
use App\ValueObjects\FetchResult;
|
|
||||||
use Illuminate\Support\Collection;
|
|
||||||
use PHPUnit\Framework\TestCase;
|
|
||||||
|
|
||||||
/**
 * Tests that FetchResult exposes the values it was constructed with, both for
 * a fully populated success result and for a failure result with null fields.
 */
class FetchResultTest extends TestCase
{
    /**
     * Every constructor argument of a successful result is readable back
     * through the public property of the same name.
     */
    public function test_it_exposes_all_fields(): void
    {
        $links = ['https://other.com', 'https://another.com'];

        $fetch = new FetchResult(
            outcome: CrawlOutcomeEnum::Success,
            statusCode: 200,
            finalUrl: 'https://example.com/article',
            title: 'An Example Article',
            extractedText: 'Lorem ipsum dolor sit amet.',
            outboundLinks: collect($links),
            wordCount: 5,
            errorMessage: null,
        );

        $this->assertSame(CrawlOutcomeEnum::Success, $fetch->outcome);
        $this->assertSame(200, $fetch->statusCode);
        $this->assertSame('https://example.com/article', $fetch->finalUrl);
        $this->assertSame('An Example Article', $fetch->title);
        $this->assertSame('Lorem ipsum dolor sit amet.', $fetch->extractedText);
        $this->assertInstanceOf(Collection::class, $fetch->outboundLinks);
        $this->assertSame($links, $fetch->outboundLinks->all());
        $this->assertSame(5, $fetch->wordCount);
        $this->assertNull($fetch->errorMessage);
    }

    /**
     * A failed result may carry null for every content field, an empty link
     * collection, and an error message.
     */
    public function test_it_accepts_null_for_failure_outcome_fields(): void
    {
        $reason = 'Could not connect';

        $fetch = new FetchResult(
            outcome: CrawlOutcomeEnum::Failed,
            statusCode: null,
            finalUrl: null,
            title: null,
            extractedText: null,
            outboundLinks: collect(),
            wordCount: null,
            errorMessage: $reason,
        );

        $this->assertSame(CrawlOutcomeEnum::Failed, $fetch->outcome);
        $this->assertNull($fetch->statusCode);
        $this->assertNull($fetch->finalUrl);
        $this->assertNull($fetch->title);
        $this->assertNull($fetch->extractedText);
        $this->assertSame([], $fetch->outboundLinks->all());
        $this->assertNull($fetch->wordCount);
        $this->assertSame($reason, $fetch->errorMessage);
    }
}
|
|
||||||
Loading…
Reference in a new issue