7 - Add page_crawls migration, PageCrawl model, factory, and Page relationships

This commit is contained in:
myrmidex 2026-04-26 14:15:49 +02:00
parent 9dd6d84d65
commit fe8ca7fc10
7 changed files with 351 additions and 0 deletions

View file

@ -10,6 +10,7 @@
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
use Illuminate\Database\Eloquent\Relations\HasMany;
use Illuminate\Database\Eloquent\Relations\HasOne;
use Lvl0\FediDiscover\Models\Instance;
class Page extends Model
@ -49,4 +50,14 @@ public function incomingLinks(): HasMany
{
return $this->hasMany(PageLink::class, 'target_page_id');
}
/**
 * All PageCrawl rows that reference this page.
 */
public function crawls(): HasMany
{
    $relation = $this->hasMany(PageCrawl::class);

    return $relation;
}
/**
 * The one PageCrawl of this page selected by the latest created_at value.
 */
public function latestCrawl(): HasOne
{
    $crawl = $this->hasOne(PageCrawl::class);

    return $crawl->latestOfMany('created_at');
}
}

41
app/Models/PageCrawl.php Normal file
View file

@ -0,0 +1,41 @@
<?php
declare(strict_types=1);
namespace App\Models;
use App\Enums\CrawlOutcomeEnum;
use Database\Factories\PageCrawlFactory;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
/**
 * Eloquent model for a crawl record attached to a Page.
 *
 * Date columns are cast to date objects, `outcome` to CrawlOutcomeEnum, and
 * the numeric columns to native integers.
 */
class PageCrawl extends Model
{
    /** @use HasFactory<PageCrawlFactory> */
    use HasFactory;

    /**
     * Attributes that may be mass-assigned through create() / fill().
     */
    protected $fillable = [
        'page_id',
        'domain',
        'priority',
        'scheduled_for',
        'completed_at',
        'outcome',
        'status_code',
        'error_message',
        'locked_at',
    ];

    /**
     * Attribute casts.
     *
     * The integer casts guarantee `priority` and `status_code` are PHP ints
     * regardless of how the database driver returns numeric columns; null
     * values (status_code is nullable) are left as null.
     */
    protected $casts = [
        'priority' => 'integer',
        'scheduled_for' => 'datetime',
        'completed_at' => 'datetime',
        'outcome' => CrawlOutcomeEnum::class,
        'status_code' => 'integer',
        'locked_at' => 'datetime',
    ];

    /**
     * The page this crawl belongs to.
     */
    public function page(): BelongsTo
    {
        return $this->belongsTo(Page::class);
    }
}

View file

@ -0,0 +1,71 @@
<?php
declare(strict_types=1);
namespace Database\Factories;
use App\Enums\CrawlOutcomeEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use Carbon\Carbon;
use Illuminate\Database\Eloquent\Factories\Factory;
/**
 * Factory producing PageCrawl models.
 *
 * @extends Factory<PageCrawl>
 */
class PageCrawlFactory extends Factory
{
    /**
     * Default attributes: a crawl with no outcome, scheduled for the current time.
     *
     * @return array<string, mixed>
     */
    public function definition(): array
    {
        return [
            // page_id is a required foreign key, so a parent Page is created
            // on demand unless the caller overrides it (e.g. through page()).
            'page_id' => Page::factory(),
            'domain' => 'example.com',
            'priority' => 0,
            'scheduled_for' => now(),
            'completed_at' => null,
            'outcome' => null,
            'status_code' => null,
            'error_message' => null,
            'locked_at' => null,
        ];
    }

    /**
     * Attach the crawl to an existing page.
     */
    public function page(Page $page): static
    {
        return $this->state(fn () => [
            'page_id' => $page->id,
        ]);
    }

    /**
     * Mark the crawl as completed with a Success outcome.
     */
    public function successful(): static
    {
        return $this->state(fn () => [
            'outcome' => CrawlOutcomeEnum::Success,
            'completed_at' => now(),
        ]);
    }

    /**
     * Mark the crawl as completed with a Failed outcome and the given message.
     */
    public function failed(string $errorMessage): static
    {
        return $this->state(fn () => [
            'outcome' => CrawlOutcomeEnum::Failed,
            'completed_at' => now(),
            'error_message' => $errorMessage,
        ]);
    }

    /**
     * Override the default scheduled_for timestamp.
     */
    public function scheduledAt(Carbon $scheduledAt): static
    {
        return $this->state(fn () => [
            'scheduled_for' => $scheduledAt,
        ]);
    }

    /**
     * Mark the crawl as locked right now with no outcome.
     *
     * completed_at is cleared together with outcome so that chaining this
     * after successful()/failed() cannot yield a row that is completed yet
     * has no outcome.
     */
    public function locked(): static
    {
        return $this->state(fn () => [
            'locked_at' => now(),
            'completed_at' => null,
            'outcome' => null,
        ]);
    }
}

View file

@ -0,0 +1,47 @@
<?php
declare(strict_types=1);
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /**
     * Create the page_crawls table together with its lookup indexes.
     */
    public function up(): void
    {
        Schema::create('page_crawls', static function (Blueprint $table): void {
            $table->id();
            // Crawl rows are removed together with their page.
            $table->foreignId('page_id')->constrained('pages')->cascadeOnDelete();
            $table->string('domain');
            $table->smallInteger('priority')->default(0);
            $table->timestampTz('scheduled_for')->useCurrent();
            $table->timestampTz('locked_at')->nullable();
            $table->timestampTz('completed_at')->nullable();
            $table->string('outcome')->nullable();
            $table->smallInteger('status_code')->nullable();
            $table->text('error_message')->nullable();
            $table->timestampsTz();

            $table->index(['page_id', 'created_at']);
        });

        // Drivers other than PostgreSQL get plain indexes over the same columns.
        if (DB::getDriverName() !== 'pgsql') {
            Schema::table('page_crawls', static function (Blueprint $table): void {
                $table->index('domain');
                $table->index(['scheduled_for', 'locked_at']);
            });

            return;
        }

        // PostgreSQL: partial indexes limited to rows whose outcome is still NULL.
        $partialIndexes = [
            'CREATE INDEX page_crawls_pending_domain_idx ON page_crawls (domain) WHERE outcome IS NULL',
            'CREATE INDEX page_crawls_pending_poll_idx ON page_crawls (scheduled_for, locked_at) WHERE outcome IS NULL',
        ];

        foreach ($partialIndexes as $sql) {
            DB::statement($sql);
        }
    }

    /**
     * Drop the page_crawls table if it exists.
     */
    public function down(): void
    {
        Schema::dropIfExists('page_crawls');
    }
};

View file

@ -0,0 +1,56 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Models;
use App\Enums\CrawlOutcomeEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Tests\TestCase;
/**
 * Exercises the named states exposed by the PageCrawl factory.
 */
class PageCrawlFactoryTest extends TestCase
{
    use RefreshDatabase;

    /**
     * successful() yields a Success outcome, a completion time and no error message.
     */
    public function test_factory_successful_state_produces_success_outcome(): void
    {
        $owner = Page::factory()->create();

        $result = PageCrawl::factory()
            ->page($owner)
            ->successful()
            ->create();

        self::assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        self::assertInstanceOf(Carbon::class, $result->completed_at);
        self::assertNull($result->error_message);
    }

    /**
     * failed() yields a Failed outcome, a completion time and the supplied message.
     */
    public function test_factory_failed_state_produces_failed_outcome_with_message(): void
    {
        $owner = Page::factory()->create();

        $result = PageCrawl::factory()
            ->page($owner)
            ->failed('Connection timed out')
            ->create();

        self::assertSame(CrawlOutcomeEnum::Failed, $result->outcome);
        self::assertInstanceOf(Carbon::class, $result->completed_at);
        self::assertSame('Connection timed out', $result->error_message);
    }

    /**
     * locked() yields a lock time while completion time and outcome stay null.
     */
    public function test_factory_locked_state_produces_in_flight_crawl(): void
    {
        $owner = Page::factory()->create();

        $result = PageCrawl::factory()
            ->page($owner)
            ->locked()
            ->create();

        self::assertInstanceOf(Carbon::class, $result->locked_at);
        self::assertNull($result->completed_at);
        self::assertNull($result->outcome);
    }

    /**
     * scheduledAt() replaces the default scheduled_for value.
     */
    public function test_factory_scheduled_at_state_overrides_default_scheduled_for(): void
    {
        $owner = Page::factory()->create();
        $expected = Carbon::parse('2026-05-01 10:00:00');

        $result = PageCrawl::factory()
            ->page($owner)
            ->scheduledAt($expected)
            ->create();

        self::assertTrue($expected->equalTo($result->scheduled_for));
    }
}

View file

@ -0,0 +1,82 @@
<?php
declare(strict_types=1);
namespace Tests\Unit\Models;
use App\Enums\CrawlOutcomeEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Tests\TestCase;
/**
 * Persistence, cast and relationship checks for the PageCrawl model.
 */
class PageCrawlTest extends TestCase
{
    use RefreshDatabase;

    /**
     * Every fillable attribute survives a round-trip and comes back with its cast applied.
     */
    public function test_page_crawl_fillable_fields_persist_and_casts_are_applied(): void
    {
        $owner = Page::factory()->create(['url' => 'https://example.com/page-1']);

        $expectedTimes = [
            'scheduled_for' => Carbon::parse('2026-05-01 10:00:00'),
            'locked_at' => Carbon::parse('2026-05-01 10:01:00'),
            'completed_at' => Carbon::parse('2026-05-01 10:01:05'),
        ];

        $attributes = [
            'page_id' => $owner->id,
            'domain' => 'example.com',
            'priority' => 5,
            'scheduled_for' => $expectedTimes['scheduled_for'],
            'locked_at' => $expectedTimes['locked_at'],
            'completed_at' => $expectedTimes['completed_at'],
            'outcome' => CrawlOutcomeEnum::Success,
            'status_code' => 200,
            'error_message' => null,
        ];

        // Reload from the database so the assertions see stored values.
        $reloaded = PageCrawl::create($attributes)->fresh();

        self::assertNotNull($reloaded);

        // Plain scalar columns.
        self::assertSame('example.com', $reloaded->domain);
        self::assertSame(5, $reloaded->priority);
        self::assertSame(200, $reloaded->status_code);
        self::assertNull($reloaded->error_message);

        // Enum cast.
        self::assertInstanceOf(CrawlOutcomeEnum::class, $reloaded->outcome);
        self::assertSame(CrawlOutcomeEnum::Success, $reloaded->outcome);

        // Datetime casts: correct type and identical instant.
        foreach ($expectedTimes as $column => $expected) {
            self::assertInstanceOf(Carbon::class, $reloaded->{$column});
            self::assertTrue($expected->equalTo($reloaded->{$column}));
        }
    }

    /**
     * The page relation resolves to the Page referenced by page_id.
     */
    public function test_page_crawl_belongs_to_a_page(): void
    {
        $owner = Page::factory()->create(['url' => 'https://example.com/page-2']);

        $record = PageCrawl::create([
            'page_id' => $owner->id,
            'domain' => 'example.com',
            'priority' => 1,
            'scheduled_for' => Carbon::now(),
        ]);

        $resolved = $record->page;

        self::assertInstanceOf(Page::class, $resolved);
        self::assertSame($owner->id, $resolved->id);
    }
}

View file

@ -6,7 +6,9 @@
use App\Enums\PageStatusEnum;
use App\Models\Page;
use App\Models\PageCrawl;
use App\Models\PageLink;
use Carbon\Carbon;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Lvl0\FediDiscover\Config\InstanceType;
use Lvl0\FediDiscover\Models\Instance;
@ -97,6 +99,47 @@ public function test_page_language_is_fillable_and_persists(): void
$this->assertNull($unset->fresh()->language);
}
/**
 * crawls returns only the PageCrawl rows that reference the page itself.
 */
public function test_page_has_many_crawls(): void
{
    $subject = Page::factory()->create();
    $unrelated = Page::factory()->create();

    // Three crawls for the page under test, then one for a different page.
    for ($i = 0; $i < 3; $i++) {
        PageCrawl::create(['page_id' => $subject->id, 'domain' => 'example.com']);
    }
    PageCrawl::create(['page_id' => $unrelated->id, 'domain' => 'other.com']);

    $found = $subject->fresh()->crawls;

    self::assertCount(3, $found);
    foreach ($found as $item) {
        self::assertInstanceOf(PageCrawl::class, $item);
        self::assertSame($subject->id, $item->page_id);
    }
}
/**
 * latestCrawl resolves to the crawl carrying the most recent created_at.
 */
public function test_page_latest_crawl_returns_row_with_latest_created_at(): void
{
    $subject = Page::factory()->create();

    // Rows are inserted oldest-first; only the last one carries the sentinel message.
    $fixtures = [
        ['2026-01-01 08:00:00', []],
        ['2026-03-15 12:00:00', []],
        ['2026-05-10 18:00:00', ['error_message' => 'sentinel-latest']],
    ];

    foreach ($fixtures as [$createdAt, $extra]) {
        $row = PageCrawl::create(['page_id' => $subject->id, 'domain' => 'example.com'] + $extra);
        // created_at is overwritten after insertion to pin it to a known value.
        $row->created_at = Carbon::parse($createdAt);
        $row->save();
    }

    $resolved = $subject->fresh()->latestCrawl;

    self::assertInstanceOf(PageCrawl::class, $resolved);
    self::assertSame('sentinel-latest', $resolved->error_message);
}
public function test_page_status_is_cast_to_enum(): void
{
$cases = [