Http::response(
'
Hello',
200,
['Content-Type' => 'text/html'],
),
]);
$result = $this->makeAction()('https://example.com/page');
$this->assertInstanceOf(FetchResult::class, $result);
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
$this->assertSame(200, $result->statusCode);
$this->assertNotNull($result->finalUrl);
}
/**
 * A faked 404 reply must be reported as Blocked4xx, carrying the status
 * code, an error message that mentions it, and a non-null final URL.
 */
public function test_4xx_response_returns_blocked_4xx(): void
{
    $notFound = Http::response('Not Found', 404);
    Http::fake(['example.com/*' => $notFound]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/missing');

    self::assertInstanceOf(FetchResult::class, $fetched);
    self::assertSame(CrawlOutcomeEnum::Blocked4xx, $fetched->outcome);
    self::assertSame(404, $fetched->statusCode);
    self::assertIsString($fetched->errorMessage);
    self::assertStringContainsString('404', $fetched->errorMessage);
    self::assertNotNull($fetched->finalUrl);
}
/**
 * A faked 503 reply must be reported as Blocked5xx, carrying the status
 * code, an error message that mentions it, and a non-null final URL.
 */
public function test_5xx_response_returns_blocked_5xx(): void
{
    $unavailable = Http::response('Service Unavailable', 503);
    Http::fake(['example.com/*' => $unavailable]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/page');

    self::assertInstanceOf(FetchResult::class, $fetched);
    self::assertSame(CrawlOutcomeEnum::Blocked5xx, $fetched->outcome);
    self::assertSame(503, $fetched->statusCode);
    self::assertIsString($fetched->errorMessage);
    self::assertStringContainsString('503', $fetched->errorMessage);
    self::assertNotNull($fetched->finalUrl);
}
/**
 * A 200 reply served as application/pdf must be Rejected: the status code
 * and final URL are kept, the error message names the content type, and
 * no title, text, links or word count are produced.
 */
public function test_non_html_content_type_returns_rejected(): void
{
    $pdfHeaders = ['Content-Type' => 'application/pdf'];
    $pdfResponse = Http::response('PDF binary stuff', 200, $pdfHeaders);
    Http::fake(['example.com/*' => $pdfResponse]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/document.pdf');

    self::assertInstanceOf(FetchResult::class, $fetched);
    self::assertSame(CrawlOutcomeEnum::Rejected, $fetched->outcome);
    self::assertSame(200, $fetched->statusCode);
    self::assertIsString($fetched->errorMessage);
    self::assertStringContainsString('application/pdf', $fetched->errorMessage);
    self::assertNotNull($fetched->finalUrl);

    // Nothing should have been extracted from a rejected document.
    self::assertNull($fetched->title);
    self::assertNull($fetched->extractedText);
    self::assertEmpty($fetched->outboundLinks);
    self::assertNull($fetched->wordCount);
}
/**
 * A Content-Type of "text/html" followed by a charset parameter must still
 * be treated as HTML and yield a Success outcome with status 200.
 */
public function test_text_html_with_charset_is_accepted(): void
{
    $headers = ['Content-Type' => 'text/html; charset=utf-8'];
    $response = Http::response('Hello charset world', 200, $headers);
    Http::fake(['example.com/*' => $response]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/page');

    self::assertInstanceOf(FetchResult::class, $fetched);
    self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    self::assertSame(200, $fetched->statusCode);
}
/**
 * When the HTTP layer throws a ConnectException with errno 6, the action
 * must return a Failed result: no status code, no final URL, a string
 * error message, and no extracted content.
 */
public function test_connection_failure_returns_failed(): void
{
    Http::fake(function () {
        $request = new Request('GET', 'https://example.com/page');
        $handlerContext = ['errno' => 6];

        throw new ConnectException('Could not resolve host', $request, null, $handlerContext);
    });

    $action = $this->makeAction();
    $fetched = $action('https://example.com/page');

    self::assertInstanceOf(FetchResult::class, $fetched);
    self::assertSame(CrawlOutcomeEnum::Failed, $fetched->outcome);
    self::assertNull($fetched->statusCode);
    self::assertNull($fetched->finalUrl);
    self::assertIsString($fetched->errorMessage);

    // No page was retrieved, so nothing can have been extracted.
    self::assertNull($fetched->title);
    self::assertNull($fetched->extractedText);
    self::assertEmpty($fetched->outboundLinks);
    self::assertNull($fetched->wordCount);
}
/**
 * When the HTTP layer throws a ConnectException with errno 28 and a
 * "timed out" message, the action must return a Timeout result with no
 * status code, no final URL, and a string error message.
 */
public function test_timeout_returns_timeout(): void
{
    Http::fake(function () {
        $request = new Request('GET', 'https://example.com/page');
        $handlerContext = ['errno' => 28];

        throw new ConnectException('cURL error 28: Operation timed out', $request, null, $handlerContext);
    });

    $action = $this->makeAction();
    $fetched = $action('https://example.com/page');

    self::assertInstanceOf(FetchResult::class, $fetched);
    self::assertSame(CrawlOutcomeEnum::Timeout, $fetched->outcome);
    self::assertNull($fetched->statusCode);
    self::assertNull($fetched->finalUrl);
    self::assertIsString($fetched->errorMessage);
}
/**
 * A successful HTML fetch must expose the page title as "My Page Title".
 *
 * NOTE(review): the fixture literal as seen here contains no markup; the
 * title presumably comes from a title element in the real fixture — confirm.
 */
public function test_success_extracts_title_from_html(): void
{
    // The literal spans two lines on purpose: its trailing newline is part of the body.
    $markup = 'My Page TitleSome content.
';
    $response = Http::response($markup, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $response]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/page');

    self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    self::assertSame('My Page Title', $fetched->title);
}
/**
 * A successful HTML fetch must produce non-null extracted text that
 * contains the phrase "main article body" from the fixture.
 */
public function test_success_extracts_main_text(): void
{
    $markup = <<<'HTML'
Article Title
The Real Article
This is the main article body that should be extracted by readability.
Multiple paragraphs prove the extractor works on the full content.
HTML;
    $response = Http::response($markup, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $response]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    self::assertNotNull($fetched->extractedText);
    self::assertStringContainsString('main article body', $fetched->extractedText);
}
/**
 * Outbound links must come back as a Collection of exactly two URLs: the
 * external article and the same-site related post (expected as an absolute
 * URL). Private-IP, credential-bearing and ftp URLs must be absent.
 *
 * NOTE(review): the anchors themselves are not visible in the fixture text
 * shown here; the hrefs are presumably present in the real fixture — confirm.
 */
public function test_success_extracts_and_filters_outbound_links(): void
{
    $markup = <<<'HTML'
Article With Links
Article Title
This article references an external article.
And a relative link to a related post on the same site.
Plus a private IP link that should be rejected.
And a credentials URL that should be rejected.
And a non-http scheme that should be rejected.
HTML;
    $response = Http::response($markup, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $response]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    self::assertInstanceOf(Collection::class, $fetched->outboundLinks);
    self::assertSame(2, $fetched->outboundLinks->count());

    // Unwrap the collection once and check membership on the plain array.
    $links = $fetched->outboundLinks->all();
    self::assertContains('https://other.com/article', $links);
    self::assertContains('https://example.com/related-post', $links);
    self::assertNotContains('http://192.168.1.1/admin', $links);
    self::assertNotContains('https://user:pass@evil.com/', $links);
    self::assertNotContains('ftp://files.example.com/', $links);
}
/**
 * The word count of a successful fetch must be 9 for this fixture, whose
 * body sentence consists of nine words.
 */
public function test_success_calculates_word_count(): void
{
    $markup = <<<'HTML'
Word Count Test
This article body has exactly nine words total here.
HTML;
    $response = Http::response($markup, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $response]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    self::assertSame(9, $fetched->wordCount);
}
/**
 * A mixed-case Content-Type ("Text/HTML; charset=utf-8") must still be
 * accepted as HTML and yield a Success outcome.
 */
public function test_uppercase_content_type_is_accepted_as_html(): void
{
    // The literal spans two lines on purpose: its trailing newline is part of the body.
    $markup = 'Uppercase CTContent here.
';
    $headers = ['Content-Type' => 'Text/HTML; charset=utf-8'];
    Http::fake(['example.com/*' => Http::response($markup, 200, $headers)]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/page');

    self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
}
/**
 * A page whose only anchor has an empty href must produce zero outbound
 * links while still being a Success.
 *
 * NOTE(review): the anchor is not visible in the fixture text shown here;
 * presumably the real fixture contains it — confirm.
 */
public function test_empty_href_is_filtered_from_outbound_links(): void
{
    $markup = <<<'HTML'
Empty Href Test
This paragraph has an empty href anchor that should be dropped.
HTML;
    $response = Http::response($markup, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $response]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $linkTotal = $fetched->outboundLinks->count();
    self::assertSame(0, $linkTotal);
}
/**
 * A page whose only anchor is a fragment-only href must produce zero
 * outbound links while still being a Success.
 *
 * NOTE(review): the anchor is not visible in the fixture text shown here;
 * presumably the real fixture contains it — confirm.
 */
public function test_fragment_only_href_is_filtered_from_outbound_links(): void
{
    $markup = <<<'HTML'
Fragment Href Test
Jump to section 2 of this page.
HTML;
    $response = Http::response($markup, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $response]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $linkTotal = $fetched->outboundLinks->count();
    self::assertSame(0, $linkTotal);
}
/**
 * With a sufficiently long body, the language detection service must be
 * called exactly once and its [language, confidence] pair must appear
 * unchanged on the result.
 */
public function test_sufficient_text_triggers_language_detection_and_result_propagates(): void
{
    // Body long enough that the detection service is expected to be consulted.
    $markup = <<<'HTML'
Language Detection Test
The quick brown fox jumps over the lazy dog and then runs away into the forest
where many other animals live and play together every single day.
HTML;
    $response = Http::response($markup, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $response]);

    // Register the mock before resolving the action so the container hands it out.
    $detector = $this->mock(LanguageDetectionService::class);
    $detector->shouldReceive('detect')->once()->andReturn(['en', 0.95]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    self::assertSame('en', $fetched->language);
    self::assertSame(0.95, $fetched->languageConfidence);
}
/**
 * With a short body, the detection service must never be called; the
 * result is expected to report language "pt-BR" with confidence 1.0.
 *
 * NOTE(review): no lang attribute is visible in the fixture text shown
 * here; "pt-BR" presumably comes from markup in the real fixture — confirm.
 */
public function test_short_body_with_html_lang_attr_skips_service_and_uses_lang_attr(): void
{
    // Body short enough that the detection service must be skipped.
    $markup = <<<'HTML'
Short Page
Too short to detect language automatically.
HTML;
    $response = Http::response($markup, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $response]);

    $detector = $this->mock(LanguageDetectionService::class);
    $detector->shouldReceive('detect')->never();

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    self::assertSame('pt-BR', $fetched->language);
    self::assertSame(1.0, $fetched->languageConfidence);
}
/**
 * With a short body and no language hint in the page, the detection
 * service must never be called and both language fields must be null.
 */
public function test_short_body_with_no_lang_attr_returns_null_language(): void
{
    // Body short enough that the detection service must be skipped.
    $markup = <<<'HTML'
Short Page
Too short to detect language automatically.
HTML;
    $response = Http::response($markup, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $response]);

    $detector = $this->mock(LanguageDetectionService::class);
    $detector->shouldReceive('detect')->never();

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    self::assertNull($fetched->language);
    self::assertNull($fetched->languageConfidence);
}
/**
 * A blank (whitespace-only) lang attribute must count as missing: with a
 * short body the detection service is never called and both language
 * fields are null.
 *
 * NOTE(review): the lang attribute is not visible in the fixture text shown
 * here; presumably the real fixture carries it — confirm.
 */
public function test_whitespace_only_lang_attr_is_treated_as_absent(): void
{
    // Body short enough that the detection service must be skipped.
    $markup = <<<'HTML'
Short Page
Too short to detect language automatically.
HTML;
    $response = Http::response($markup, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $response]);

    $detector = $this->mock(LanguageDetectionService::class);
    $detector->shouldReceive('detect')->never();

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    self::assertNull($fetched->language);
    self::assertNull($fetched->languageConfidence);
}
/**
 * A lang attribute longer than 35 characters must be discarded: with a
 * short body the detection service is never called and both language
 * fields are null.
 *
 * NOTE(review): the over-long attribute is not visible in the fixture text
 * shown here; the 35-character limit presumably mirrors the storage column
 * width mentioned in the original comment — confirm both.
 */
public function test_lang_attr_longer_than_35_chars_is_rejected(): void
{
    // Body short enough that the detection service must be skipped.
    $markup = <<<'HTML'
Short Page
Too short to detect language automatically.
HTML;
    $response = Http::response($markup, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $response]);

    $detector = $this->mock(LanguageDetectionService::class);
    $detector->shouldReceive('detect')->never();

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    self::assertNull($fetched->language);
    self::assertNull($fetched->languageConfidence);
}
/**
 * When the detection service is called once and answers with a
 * low-confidence guess (['xx', 0.15]), that guess must be ignored; the
 * result is expected to report "en-US" with confidence 1.0 instead.
 *
 * NOTE(review): no lang attribute is visible in the fixture text shown
 * here; "en-US" presumably comes from markup in the real fixture — confirm.
 */
public function test_low_confidence_detection_falls_through_to_lang_attr(): void
{
    // Body long enough that the detection service is expected to be consulted.
    $markup = <<<'HTML'
Confidence Floor Test
The quick brown fox jumps over the lazy dog and then runs away into the forest
where many other animals live and play together every single day.
HTML;
    $response = Http::response($markup, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $response]);

    // Register the mock before resolving the action so the container hands it out.
    $detector = $this->mock(LanguageDetectionService::class);
    $detector->shouldReceive('detect')->once()->andReturn(['xx', 0.15]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    self::assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    self::assertSame('en-US', $fetched->language);
    self::assertSame(1.0, $fetched->languageConfidence);
}
/**
 * Resolve the action under test from the service container, so any
 * instance bound beforehand (e.g. via $this->mock()) is the one it receives.
 */
private function makeAction(): FetchPageAction
{
    $container = app();

    return $container->make(FetchPageAction::class);
}
}