2026-04-26 17:56:13 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
|
|
declare(strict_types=1);
|
|
|
|
|
|
|
|
|
|
namespace Tests\Feature\Actions;
|
|
|
|
|
|
|
|
|
|
use App\Actions\FetchPageAction;
|
|
|
|
|
use App\Enums\CrawlOutcomeEnum;
|
2026-04-28 01:09:18 +02:00
|
|
|
use App\Services\LanguageDetectionService;
|
2026-04-26 17:56:13 +02:00
|
|
|
use App\ValueObjects\FetchResult;
|
|
|
|
|
use GuzzleHttp\Exception\ConnectException;
|
|
|
|
|
use GuzzleHttp\Psr7\Request;
|
2026-04-26 19:35:04 +02:00
|
|
|
use Illuminate\Support\Collection;
|
2026-04-26 17:56:13 +02:00
|
|
|
use Illuminate\Support\Facades\Http;
|
2026-04-30 00:26:07 +02:00
|
|
|
use Tests\Feature\TestCase;
|
2026-04-26 17:56:13 +02:00
|
|
|
|
|
|
|
|
/**
 * Feature tests for FetchPageAction.
 *
 * Each test fakes the HTTP layer with Http::fake(), invokes the action resolved
 * from the container against an example.com URL, and asserts on the returned
 * FetchResult (outcome, status code, extracted content and detected language).
 */
class FetchPageActionTest extends TestCase
{
    public function test_successful_html_fetch_returns_success_outcome(): void
    {
        $this->fakeHtmlPage('<html><body>Hello</body></html>');

        $result = $this->makeAction()('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $result);
        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertSame(200, $result->statusCode);
        $this->assertNotNull($result->finalUrl);
    }

    public function test_4xx_response_returns_blocked_4xx(): void
    {
        $this->fakeResponse('Not Found', 404);

        $result = $this->makeAction()('https://example.com/missing');

        $this->assertInstanceOf(FetchResult::class, $result);
        $this->assertSame(CrawlOutcomeEnum::Blocked4xx, $result->outcome);
        $this->assertSame(404, $result->statusCode);
        $this->assertIsString($result->errorMessage);
        $this->assertStringContainsString('404', $result->errorMessage);
        $this->assertNotNull($result->finalUrl);
    }

    public function test_5xx_response_returns_blocked_5xx(): void
    {
        $this->fakeResponse('Service Unavailable', 503);

        $result = $this->makeAction()('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $result);
        $this->assertSame(CrawlOutcomeEnum::Blocked5xx, $result->outcome);
        $this->assertSame(503, $result->statusCode);
        $this->assertIsString($result->errorMessage);
        $this->assertStringContainsString('503', $result->errorMessage);
        $this->assertNotNull($result->finalUrl);
    }

    public function test_non_html_content_type_returns_rejected(): void
    {
        $this->fakeResponse('PDF binary stuff', 200, ['Content-Type' => 'application/pdf']);

        $result = $this->makeAction()('https://example.com/document.pdf');

        $this->assertInstanceOf(FetchResult::class, $result);
        $this->assertSame(CrawlOutcomeEnum::Rejected, $result->outcome);
        $this->assertSame(200, $result->statusCode);
        $this->assertIsString($result->errorMessage);
        $this->assertStringContainsString('application/pdf', $result->errorMessage);
        $this->assertNotNull($result->finalUrl);

        // A rejected response must not carry any extracted content.
        $this->assertNull($result->title);
        $this->assertNull($result->extractedText);
        $this->assertEmpty($result->outboundLinks);
        $this->assertNull($result->wordCount);
    }

    public function test_text_html_with_charset_is_accepted(): void
    {
        $this->fakeResponse(
            '<html><body>Hello charset world</body></html>',
            200,
            ['Content-Type' => 'text/html; charset=utf-8'],
        );

        $result = $this->makeAction()('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $result);
        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertSame(200, $result->statusCode);
    }

    public function test_connection_failure_returns_failed(): void
    {
        $this->fakeConnectionError('Could not resolve host', 6);

        $result = $this->makeAction()('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $result);
        $this->assertSame(CrawlOutcomeEnum::Failed, $result->outcome);
        $this->assertNull($result->statusCode);
        $this->assertNull($result->finalUrl);
        $this->assertIsString($result->errorMessage);

        // No response was received, so nothing can have been extracted.
        $this->assertNull($result->title);
        $this->assertNull($result->extractedText);
        $this->assertEmpty($result->outboundLinks);
        $this->assertNull($result->wordCount);
    }

    public function test_timeout_returns_timeout(): void
    {
        $this->fakeConnectionError('cURL error 28: Operation timed out', 28);

        $result = $this->makeAction()('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $result);
        $this->assertSame(CrawlOutcomeEnum::Timeout, $result->outcome);
        $this->assertNull($result->statusCode);
        $this->assertNull($result->finalUrl);
        $this->assertIsString($result->errorMessage);
    }

    public function test_success_extracts_title_from_html(): void
    {
        $this->fakeHtmlPage(
            '<html><head><title>My Page Title</title></head><body><p>Some content.</p></body></html>',
        );

        $result = $this->makeAction()('https://example.com/page');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertSame('My Page Title', $result->title);
    }

    public function test_success_extracts_main_text(): void
    {
        $html = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Article Title</title></head>
        <body>
        <nav>Navigation links</nav>
        <article>
        <h1>The Real Article</h1>
        <p>This is the main article body that should be extracted by readability.</p>
        <p>Multiple paragraphs prove the extractor works on the full content.</p>
        </article>
        <footer>Site footer noise</footer>
        </body>
        </html>
        HTML;

        $this->fakeHtmlPage($html);

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertNotNull($result->extractedText);
        $this->assertStringContainsString('main article body', $result->extractedText);
    }

    public function test_success_extracts_and_filters_outbound_links(): void
    {
        $html = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Article With Links</title></head>
        <body>
        <nav>
        <a href="/home">Home (nav, should be filtered out by Readability scope)</a>
        </nav>
        <article>
        <h1>Article Title</h1>
        <p>This article references <a href="https://other.com/article">an external article</a>.</p>
        <p>And a <a href="/related-post">relative link to a related post</a> on the same site.</p>
        <p>Plus a <a href="http://192.168.1.1/admin">private IP link</a> that should be rejected.</p>
        <p>And a <a href="https://user:pass@evil.com/">credentials URL</a> that should be rejected.</p>
        <p>And a <a href="ftp://files.example.com/">non-http scheme</a> that should be rejected.</p>
        </article>
        <footer>
        <a href="/privacy">Privacy (footer, filtered by Readability scope)</a>
        </footer>
        </body>
        </html>
        HTML;

        $this->fakeHtmlPage($html);

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertInstanceOf(Collection::class, $result->outboundLinks);
        $this->assertCount(2, $result->outboundLinks);

        $links = $result->outboundLinks->all();

        // Kept: the external article and the relative link resolved against the page URL.
        $this->assertContains('https://other.com/article', $links);
        $this->assertContains('https://example.com/related-post', $links);

        // Dropped: private IP, embedded credentials and non-HTTP scheme.
        $this->assertNotContains('http://192.168.1.1/admin', $links);
        $this->assertNotContains('https://user:pass@evil.com/', $links);
        $this->assertNotContains('ftp://files.example.com/', $links);
    }

    public function test_success_calculates_word_count(): void
    {
        $html = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Word Count Test</title></head>
        <body>
        <article>
        <p>This article body has exactly nine words total here.</p>
        </article>
        </body>
        </html>
        HTML;

        $this->fakeHtmlPage($html);

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertSame(9, $result->wordCount);
    }

    public function test_uppercase_content_type_is_accepted_as_html(): void
    {
        $this->fakeResponse(
            '<html><head><title>Uppercase CT</title></head><body><p>Content here.</p></body></html>',
            200,
            ['Content-Type' => 'Text/HTML; charset=utf-8'],
        );

        $result = $this->makeAction()('https://example.com/page');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
    }

    public function test_empty_href_is_filtered_from_outbound_links(): void
    {
        $html = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Empty Href Test</title></head>
        <body>
        <article>
        <p>This paragraph has <a href="">an empty href anchor</a> that should be dropped.</p>
        </article>
        </body>
        </html>
        HTML;

        $this->fakeHtmlPage($html);

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertCount(0, $result->outboundLinks);
    }

    public function test_fragment_only_href_is_filtered_from_outbound_links(): void
    {
        $html = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Fragment Href Test</title></head>
        <body>
        <article>
        <p>Jump to <a href="#section-2">section 2</a> of this page.</p>
        </article>
        </body>
        </html>
        HTML;

        $this->fakeHtmlPage($html);

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertCount(0, $result->outboundLinks);
    }

    public function test_sufficient_text_triggers_language_detection_and_result_propagates(): void
    {
        // The article paragraph is 27 words long; the test expects that to be
        // enough text for the action to call the detection service.
        $body = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Language Detection Test</title></head>
        <body>
        <article>
        <p>The quick brown fox jumps over the lazy dog and then runs away into the forest
        where many other animals live and play together every single day.</p>
        </article>
        </body>
        </html>
        HTML;

        $this->fakeHtmlPage($body);
        $this->expectLanguageDetection('en', 0.95);

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertSame('en', $result->language);
        $this->assertSame(0.95, $result->languageConfidence);
    }

    public function test_short_body_with_html_lang_attr_skips_service_and_uses_lang_attr(): void
    {
        // 6-word paragraph — expected to be below the detection threshold.
        $this->fakeHtmlPage($this->shortPageHtml('pt-BR'));
        $this->expectNoLanguageDetection();

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertSame('pt-BR', $result->language);
        $this->assertSame(1.0, $result->languageConfidence);
    }

    public function test_short_body_with_no_lang_attr_returns_null_language(): void
    {
        // 6-word paragraph — expected to be below the detection threshold.
        $this->fakeHtmlPage($this->shortPageHtml());
        $this->expectNoLanguageDetection();

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertNull($result->language);
        $this->assertNull($result->languageConfidence);
    }

    public function test_whitespace_only_lang_attr_is_treated_as_absent(): void
    {
        // 6-word paragraph — expected to be below the detection threshold;
        // the lang attribute is blank/whitespace-only.
        $this->fakeHtmlPage($this->shortPageHtml(' '));
        $this->expectNoLanguageDetection();

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertNull($result->language);
        $this->assertNull($result->languageConfidence);
    }

    public function test_lang_attr_longer_than_35_chars_is_rejected(): void
    {
        // 6-word paragraph — expected to be below the detection threshold;
        // the lang attribute exceeds the BCP-47 column width (string(35)).
        $this->fakeHtmlPage($this->shortPageHtml('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-bbb'));
        $this->expectNoLanguageDetection();

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertNull($result->language);
        $this->assertNull($result->languageConfidence);
    }

    public function test_low_confidence_detection_falls_through_to_lang_attr(): void
    {
        // 27-word paragraph, so the detection service is expected to be called;
        // it returns a low-confidence result and the lang attribute should win.
        $html = <<<'HTML'
        <!DOCTYPE html>
        <html lang="en-US">
        <head><title>Confidence Floor Test</title></head>
        <body>
        <article>
        <p>The quick brown fox jumps over the lazy dog and then runs away into the forest
        where many other animals live and play together every single day.</p>
        </article>
        </body>
        </html>
        HTML;

        $this->fakeHtmlPage($html);
        $this->expectLanguageDetection('xx', 0.15);

        $result = $this->makeAction()('https://example.com/article');

        $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
        $this->assertSame('en-US', $result->language);
        $this->assertSame(1.0, $result->languageConfidence);
    }

    /**
     * Resolve the action under test from the container, so that any service
     * mocks registered beforehand are injected into it.
     */
    private function makeAction(): FetchPageAction
    {
        return app(FetchPageAction::class);
    }

    /**
     * Fake every request to example.com/* with the given response.
     *
     * @param array<string, string> $headers
     */
    private function fakeResponse(string $body, int $status = 200, array $headers = []): void
    {
        Http::fake([
            'example.com/*' => Http::response($body, $status, $headers),
        ]);
    }

    /**
     * Fake every request to example.com/* with a 200 text/html response.
     */
    private function fakeHtmlPage(string $html): void
    {
        $this->fakeResponse($html, 200, ['Content-Type' => 'text/html']);
    }

    /**
     * Make every faked request throw a Guzzle ConnectException carrying the
     * given message and cURL errno in its handler context.
     */
    private function fakeConnectionError(string $message, int $errno): void
    {
        Http::fake(static function () use ($message, $errno): void {
            throw new ConnectException(
                $message,
                new Request('GET', 'https://example.com/page'),
                null,
                ['errno' => $errno],
            );
        });
    }

    /**
     * Bind a LanguageDetectionService mock whose detect() must be called
     * exactly once and returns the given [language, confidence] pair.
     */
    private function expectLanguageDetection(string $language, float $confidence): void
    {
        $this->mock(LanguageDetectionService::class)
            ->shouldReceive('detect')
            ->once()
            ->andReturn([$language, $confidence]);
    }

    /**
     * Bind a LanguageDetectionService mock whose detect() must never be called.
     */
    private function expectNoLanguageDetection(): void
    {
        $this->mock(LanguageDetectionService::class)
            ->shouldReceive('detect')
            ->never();
    }

    /**
     * Build the "Short Page" fixture: a document whose article holds a single
     * 6-word paragraph.
     *
     * @param string|null $lang Value for the <html lang="..."> attribute, or
     *                          null to emit a bare <html> tag.
     */
    private function shortPageHtml(?string $lang = null): string
    {
        $htmlTag = $lang === null ? '<html>' : '<html lang="' . $lang . '">';

        return <<<HTML
        <!DOCTYPE html>
        {$htmlTag}
        <head><title>Short Page</title></head>
        <body>
        <article>
        <p>Too short to detect language automatically.</p>
        </article>
        </body>
        </html>
        HTML;
    }
}
|