From fe8ca7fc10397eeae99e3996c8ade035e7bb2935 Mon Sep 17 00:00:00 2001 From: myrmidex Date: Sun, 26 Apr 2026 14:15:49 +0200 Subject: [PATCH] 7 - Add page_crawls migration, PageCrawl model, factory, and Page relationships --- app/Models/Page.php | 11 +++ app/Models/PageCrawl.php | 41 ++++++++++ database/factories/PageCrawlFactory.php | 71 ++++++++++++++++ ..._04_26_111140_create_page_crawls_table.php | 47 +++++++++++ tests/Unit/Models/PageCrawlFactoryTest.php | 56 +++++++++++++ tests/Unit/Models/PageCrawlTest.php | 82 +++++++++++++++++++ tests/Unit/Models/PageTest.php | 43 ++++++++++ 7 files changed, 351 insertions(+) create mode 100644 app/Models/PageCrawl.php create mode 100644 database/factories/PageCrawlFactory.php create mode 100644 database/migrations/2026_04_26_111140_create_page_crawls_table.php create mode 100644 tests/Unit/Models/PageCrawlFactoryTest.php create mode 100644 tests/Unit/Models/PageCrawlTest.php diff --git a/app/Models/Page.php b/app/Models/Page.php index ab31e45..60ce74d 100644 --- a/app/Models/Page.php +++ b/app/Models/Page.php @@ -10,6 +10,7 @@ use Illuminate\Database\Eloquent\Model; use Illuminate\Database\Eloquent\Relations\BelongsTo; use Illuminate\Database\Eloquent\Relations\HasMany; +use Illuminate\Database\Eloquent\Relations\HasOne; use Lvl0\FediDiscover\Models\Instance; class Page extends Model @@ -49,4 +50,14 @@ public function incomingLinks(): HasMany { return $this->hasMany(PageLink::class, 'target_page_id'); } + + public function crawls(): HasMany + { + return $this->hasMany(PageCrawl::class); + } + + public function latestCrawl(): HasOne + { + return $this->hasOne(PageCrawl::class)->latestOfMany('created_at'); + } } diff --git a/app/Models/PageCrawl.php b/app/Models/PageCrawl.php new file mode 100644 index 0000000..a615e77 --- /dev/null +++ b/app/Models/PageCrawl.php @@ -0,0 +1,41 @@ + */ + use HasFactory; + + protected $fillable = [ + 'page_id', + 'domain', + 'priority', + 'scheduled_for', + 'completed_at', + 'outcome', + 'status_code', + 'error_message', + 'locked_at', + ]; + + protected $casts = [ + 'scheduled_for' => 'datetime', + 'completed_at' => 'datetime', + 'outcome' => CrawlOutcomeEnum::class, + 'locked_at' => 'datetime', + ]; + + public function page(): BelongsTo + { + return $this->belongsTo(Page::class); + } +} diff --git a/database/factories/PageCrawlFactory.php b/database/factories/PageCrawlFactory.php new file mode 100644 index 0000000..80c6f7c --- /dev/null +++ b/database/factories/PageCrawlFactory.php @@ -0,0 +1,71 @@ + + */ +class PageCrawlFactory extends Factory +{ + public function definition(): array + { + return [ + 'page_id' => null, + 'domain' => 'example.com', + 'priority' => 0, + 'scheduled_for' => now(), + 'completed_at' => null, + 'outcome' => null, + 'status_code' => null, + 'error_message' => null, + 'locked_at' => null, + ]; + } + + public function page(Page $page): static + { + return $this->state(fn () => [ + 'page_id' => $page->id, + ]); + } + + public function successful(): static + { + return $this->state(fn () => [ + 'outcome' => CrawlOutcomeEnum::Success, + 'completed_at' => now(), + ]); + } + + public function failed(string $errorMessage): static + { + return $this->state(fn () => [ + 'outcome' => CrawlOutcomeEnum::Failed, + 'completed_at' => now(), + 'error_message' => $errorMessage, + ]); + } + + public function scheduledAt(Carbon $scheduledAt): static + { + return $this->state(fn () => [ + 'scheduled_for' => $scheduledAt, + ]); + } + + public function locked(): static + { + return $this->state(fn () => [ + 'locked_at' => now(), + 'outcome' => null, + ]); + } +} diff --git a/database/migrations/2026_04_26_111140_create_page_crawls_table.php b/database/migrations/2026_04_26_111140_create_page_crawls_table.php new file mode 100644 index 0000000..b423f25 --- /dev/null +++ b/database/migrations/2026_04_26_111140_create_page_crawls_table.php @@ -0,0 +1,47 @@ +id(); + $table->foreignId('page_id') + ->constrained('pages') + ->cascadeOnDelete(); + $table->string('domain'); + $table->smallInteger('priority')->default(0); + $table->timestampTz('scheduled_for')->useCurrent(); + $table->timestampTz('locked_at')->nullable(); + $table->timestampTz('completed_at')->nullable(); + $table->string('outcome')->nullable(); + $table->smallInteger('status_code')->nullable(); + $table->text('error_message')->nullable(); + $table->timestampsTz(); + + $table->index(['page_id', 'created_at']); + }); + + if (DB::getDriverName() === 'pgsql') { + DB::statement('CREATE INDEX page_crawls_pending_domain_idx ON page_crawls (domain) WHERE outcome IS NULL'); + DB::statement('CREATE INDEX page_crawls_pending_poll_idx ON page_crawls (scheduled_for, locked_at) WHERE outcome IS NULL'); + } else { + Schema::table('page_crawls', function (Blueprint $table) { + $table->index('domain'); + $table->index(['scheduled_for', 'locked_at']); + }); + } + } + + public function down(): void + { + Schema::dropIfExists('page_crawls'); + } +}; diff --git a/tests/Unit/Models/PageCrawlFactoryTest.php b/tests/Unit/Models/PageCrawlFactoryTest.php new file mode 100644 index 0000000..65d29cc --- /dev/null +++ b/tests/Unit/Models/PageCrawlFactoryTest.php @@ -0,0 +1,56 @@ +create(); + $crawl = PageCrawl::factory()->page($page)->successful()->create(); + + $this->assertSame(CrawlOutcomeEnum::Success, $crawl->outcome); + $this->assertInstanceOf(Carbon::class, $crawl->completed_at); + $this->assertNull($crawl->error_message); + } + + public function test_factory_failed_state_produces_failed_outcome_with_message(): void + { + $page = Page::factory()->create(); + $crawl = PageCrawl::factory()->page($page)->failed('Connection timed out')->create(); + + $this->assertSame(CrawlOutcomeEnum::Failed, $crawl->outcome); + $this->assertInstanceOf(Carbon::class, $crawl->completed_at); + $this->assertSame('Connection timed out', $crawl->error_message); + } + + public function test_factory_locked_state_produces_in_flight_crawl(): void + { + $page = Page::factory()->create(); + $crawl = PageCrawl::factory()->page($page)->locked()->create(); + + $this->assertInstanceOf(Carbon::class, $crawl->locked_at); + $this->assertNull($crawl->completed_at); + $this->assertNull($crawl->outcome); + } + + public function test_factory_scheduled_at_state_overrides_default_scheduled_for(): void + { + $page = Page::factory()->create(); + $timestamp = Carbon::parse('2026-05-01 10:00:00'); + $crawl = PageCrawl::factory()->page($page)->scheduledAt($timestamp)->create(); + + $this->assertTrue($timestamp->equalTo($crawl->scheduled_for)); + } +} diff --git a/tests/Unit/Models/PageCrawlTest.php b/tests/Unit/Models/PageCrawlTest.php new file mode 100644 index 0000000..107e542 --- /dev/null +++ b/tests/Unit/Models/PageCrawlTest.php @@ -0,0 +1,82 @@ +create(['url' => 'https://example.com/page-1']); + + $scheduledFor = Carbon::parse('2026-05-01 10:00:00'); + $lockedAt = Carbon::parse('2026-05-01 10:01:00'); + $completedAt = Carbon::parse('2026-05-01 10:01:05'); + + $crawl = PageCrawl::create([ + 'page_id' => $page->id, + 'domain' => 'example.com', + 'priority' => 5, + 'scheduled_for' => $scheduledFor, + 'locked_at' => $lockedAt, + 'completed_at' => $completedAt, + 'outcome' => CrawlOutcomeEnum::Success, + 'status_code' => 200, + 'error_message' => null, + ]); + + $fresh = $crawl->fresh(); + + $this->assertNotNull($fresh); + + // domain / priority round-trip + $this->assertSame('example.com', $fresh->domain); + $this->assertSame(5, $fresh->priority); + + // outcome is cast to the enum + $this->assertInstanceOf(CrawlOutcomeEnum::class, $fresh->outcome); + $this->assertSame(CrawlOutcomeEnum::Success, $fresh->outcome); + + // datetime casts + $this->assertInstanceOf(Carbon::class, $fresh->scheduled_for); + $this->assertInstanceOf(Carbon::class, $fresh->locked_at); + $this->assertInstanceOf(Carbon::class, $fresh->completed_at); + + $this->assertTrue($scheduledFor->equalTo($fresh->scheduled_for)); + $this->assertTrue($lockedAt->equalTo($fresh->locked_at)); + $this->assertTrue($completedAt->equalTo($fresh->completed_at)); + + // nullable columns + $this->assertNull($fresh->error_message); + + // status_code persists + $this->assertSame(200, $fresh->status_code); + } + + public function test_page_crawl_belongs_to_a_page(): void + { + $page = Page::factory()->create(['url' => 'https://example.com/page-2']); + + $crawl = PageCrawl::create([ + 'page_id' => $page->id, + 'domain' => 'example.com', + 'priority' => 1, + 'scheduled_for' => Carbon::now(), + ]); + + $related = $crawl->page; + + $this->assertInstanceOf(Page::class, $related); + $this->assertSame($page->id, $related->id); + } +} diff --git a/tests/Unit/Models/PageTest.php b/tests/Unit/Models/PageTest.php index 8319510..217c831 100644 --- a/tests/Unit/Models/PageTest.php +++ b/tests/Unit/Models/PageTest.php @@ -6,7 +6,9 @@ use App\Enums\PageStatusEnum; use App\Models\Page; +use App\Models\PageCrawl; use App\Models\PageLink; +use Carbon\Carbon; use Illuminate\Foundation\Testing\RefreshDatabase; use Lvl0\FediDiscover\Config\InstanceType; use Lvl0\FediDiscover\Models\Instance; @@ -97,6 +99,47 @@ public function test_page_language_is_fillable_and_persists(): void $this->assertNull($unset->fresh()->language); } + public function test_page_has_many_crawls(): void + { + $page = Page::factory()->create(); + $other = Page::factory()->create(); + + PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); + PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); + PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); + PageCrawl::create(['page_id' => $other->id, 'domain' => 'other.com']); + + $crawls = $page->fresh()->crawls; + + $this->assertCount(3, $crawls); + foreach ($crawls as $crawl) { + $this->assertInstanceOf(PageCrawl::class, $crawl); + $this->assertSame($page->id, $crawl->page_id); + } + } + + public function test_page_latest_crawl_returns_row_with_latest_created_at(): void + { + $page = Page::factory()->create(); + + $old = PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); + $old->created_at = Carbon::parse('2026-01-01 08:00:00'); + $old->save(); + + $middle = PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com']); + $middle->created_at = Carbon::parse('2026-03-15 12:00:00'); + $middle->save(); + + $newest = PageCrawl::create(['page_id' => $page->id, 'domain' => 'example.com', 'error_message' => 'sentinel-latest']); + $newest->created_at = Carbon::parse('2026-05-10 18:00:00'); + $newest->save(); + + $latest = $page->fresh()->latestCrawl; + + $this->assertInstanceOf(PageCrawl::class, $latest); + $this->assertSame('sentinel-latest', $latest->error_message); + } + public function test_page_status_is_cast_to_enum(): void { $cases = [