13 - Persist detected language and confidence on Page after successful fetch
This commit is contained in:
parent
81b3c7f70b
commit
1cba8f3fc9
2 changed files with 56 additions and 0 deletions
|
|
@@ -84,6 +84,8 @@ private function updatePageStatus(FetchResult $result): void
|
||||||
'status' => $status,
|
'status' => $status,
|
||||||
'fetched_at' => now(),
|
'fetched_at' => now(),
|
||||||
'title' => $result->title,
|
'title' => $result->title,
|
||||||
|
'language' => $result->language,
|
||||||
|
'language_confidence' => $result->languageConfidence,
|
||||||
],
|
],
|
||||||
PageStatusEnum::Failed => [
|
PageStatusEnum::Failed => [
|
||||||
'status' => $status,
|
'status' => $status,
|
||||||
|
|
|
||||||
|
|
@@ -457,6 +457,56 @@ public function test_handle_proceeds_through_politeness_lock_when_robots_allow()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function test_handle_persists_language_on_success(): void
|
||||||
|
{
|
||||||
|
Queue::fake();
|
||||||
|
|
||||||
|
$this->mockFetchPageAction(
|
||||||
|
CrawlOutcomeEnum::Success,
|
||||||
|
statusCode: 200,
|
||||||
|
title: 'Hello',
|
||||||
|
extractedText: 'hi',
|
||||||
|
wordCount: 1,
|
||||||
|
language: 'en',
|
||||||
|
languageConfidence: 0.95,
|
||||||
|
);
|
||||||
|
|
||||||
|
$page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
|
||||||
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
|
->handle();
|
||||||
|
|
||||||
|
$fresh = $page->fresh();
|
||||||
|
$this->assertSame('en', $fresh->language);
|
||||||
|
$this->assertEqualsWithDelta(0.95, $fresh->language_confidence, 0.001);
|
||||||
|
}
|
||||||
|
|
||||||
|
public function test_handle_persists_null_language_on_success(): void
|
||||||
|
{
|
||||||
|
Queue::fake();
|
||||||
|
|
||||||
|
$this->mockFetchPageAction(
|
||||||
|
CrawlOutcomeEnum::Success,
|
||||||
|
statusCode: 200,
|
||||||
|
title: 'Hello',
|
||||||
|
extractedText: 'hi',
|
||||||
|
wordCount: 1,
|
||||||
|
language: null,
|
||||||
|
languageConfidence: null,
|
||||||
|
);
|
||||||
|
|
||||||
|
$page = Page::factory()->createQuietly(['url' => 'https://example.com/article']);
|
||||||
|
$crawl = PageCrawl::factory()->page($page)->createQuietly();
|
||||||
|
|
||||||
|
app(ProcessCrawlJob::class, ['pageCrawl' => $crawl])
|
||||||
|
->handle();
|
||||||
|
|
||||||
|
$fresh = $page->fresh();
|
||||||
|
$this->assertNull($fresh->language);
|
||||||
|
$this->assertNull($fresh->language_confidence);
|
||||||
|
}
|
||||||
|
|
||||||
private function mockFetchPageAction(
|
private function mockFetchPageAction(
|
||||||
CrawlOutcomeEnum $outcome,
|
CrawlOutcomeEnum $outcome,
|
||||||
?int $statusCode = null,
|
?int $statusCode = null,
|
||||||
|
|
@@ -466,6 +516,8 @@ private function mockFetchPageAction(
|
||||||
?Collection $outboundLinks = null,
|
?Collection $outboundLinks = null,
|
||||||
?int $wordCount = null,
|
?int $wordCount = null,
|
||||||
?string $errorMessage = null,
|
?string $errorMessage = null,
|
||||||
|
?string $language = null,
|
||||||
|
?float $languageConfidence = null,
|
||||||
): void {
|
): void {
|
||||||
$fetcher = Mockery::mock(FetchPageAction::class);
|
$fetcher = Mockery::mock(FetchPageAction::class);
|
||||||
$fetcher->shouldReceive('__invoke')->andReturn(new FetchResult(
|
$fetcher->shouldReceive('__invoke')->andReturn(new FetchResult(
|
||||||
|
|
@@ -477,6 +529,8 @@ private function mockFetchPageAction(
|
||||||
outboundLinks: $outboundLinks ?? collect(),
|
outboundLinks: $outboundLinks ?? collect(),
|
||||||
wordCount: $wordCount,
|
wordCount: $wordCount,
|
||||||
errorMessage: $errorMessage,
|
errorMessage: $errorMessage,
|
||||||
|
language: $language,
|
||||||
|
languageConfidence: $languageConfidence,
|
||||||
));
|
));
|
||||||
$this->app->instance(FetchPageAction::class, $fetcher);
|
$this->app->instance(FetchPageAction::class, $fetcher);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue