The Real Article
This is the main article body that should be extracted by readability.
Multiple paragraphs prove the extractor works on the full content.
Http::response( '
Hello', 200, ['Content-Type' => 'text/html'], ), ]); $result = $this->makeAction()('https://example.com/page'); $this->assertInstanceOf(FetchResult::class, $result); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); $this->assertSame(200, $result->statusCode); $this->assertNotNull($result->finalUrl); }

    /**
     * A faked 404 response must produce a Blocked4xx outcome that carries the
     * status code, an error message mentioning "404", and a non-null final URL.
     */
    public function test_4xx_response_returns_blocked_4xx(): void
    {
        $notFound = Http::response('Not Found', 404);
        Http::fake(['example.com/*' => $notFound]);

        $fetch = $this->makeAction();
        $fetched = $fetch('https://example.com/missing');

        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Blocked4xx, $fetched->outcome);
        $this->assertSame(404, $fetched->statusCode);
        $this->assertIsString($fetched->errorMessage);
        $this->assertStringContainsString('404', $fetched->errorMessage);
        $this->assertNotNull($fetched->finalUrl);
    }

    /**
     * A faked 503 response must produce a Blocked5xx outcome that carries the
     * status code, an error message mentioning "503", and a non-null final URL.
     */
    public function test_5xx_response_returns_blocked_5xx(): void
    {
        $unavailable = Http::response('Service Unavailable', 503);
        Http::fake(['example.com/*' => $unavailable]);

        $fetch = $this->makeAction();
        $fetched = $fetch('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Blocked5xx, $fetched->outcome);
        $this->assertSame(503, $fetched->statusCode);
        $this->assertIsString($fetched->errorMessage);
        $this->assertStringContainsString('503', $fetched->errorMessage);
        $this->assertNotNull($fetched->finalUrl);
    }

    /**
     * A 200 response served as application/pdf must be Rejected: the status
     * code and final URL are kept, the error message names the content type,
     * and no title, text, links, or word count are populated.
     */
    public function test_non_html_content_type_returns_rejected(): void
    {
        $pdfResponse = Http::response(
            'PDF binary stuff',
            200,
            ['Content-Type' => 'application/pdf'],
        );
        Http::fake(['example.com/*' => $pdfResponse]);

        $fetch = $this->makeAction();
        $fetched = $fetch('https://example.com/document.pdf');

        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Rejected, $fetched->outcome);
        $this->assertSame(200, $fetched->statusCode);
        $this->assertIsString($fetched->errorMessage);
        $this->assertStringContainsString('application/pdf', $fetched->errorMessage);
        $this->assertNotNull($fetched->finalUrl);

        // Nothing may be extracted from a rejected document.
        $this->assertNull($fetched->title);
        $this->assertNull($fetched->extractedText);
        $this->assertEmpty($fetched->outboundLinks);
        $this->assertNull($fetched->wordCount);
    }

    /**
     * A Content-Type of "text/html; charset=utf-8" (media type plus a charset
     * parameter) must still be treated as HTML and yield a Success outcome.
     */
    public function test_text_html_with_charset_is_accepted(): void
    {
        $htmlResponse = Http::response(
            'Hello charset world',
            200,
            ['Content-Type' => 'text/html; charset=utf-8'],
        );
        Http::fake(['example.com/*' => $htmlResponse]);

        $fetch = $this->makeAction();
        $fetched = $fetch('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        $this->assertSame(200, $fetched->statusCode);
    }

    /**
     * When the fake transport throws a ConnectException built with
     * ['errno' => 6], the action must return a Failed result with no status
     * code, no final URL, a string error message, and no extracted data.
     */
    public function test_connection_failure_returns_failed(): void
    {
        Http::fake(function () {
            $request = new Request('GET', 'https://example.com/page');

            throw new ConnectException('Could not resolve host', $request, null, ['errno' => 6]);
        });

        $fetch = $this->makeAction();
        $fetched = $fetch('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Failed, $fetched->outcome);
        $this->assertNull($fetched->statusCode);
        $this->assertNull($fetched->finalUrl);
        $this->assertIsString($fetched->errorMessage);

        // No response was received, so nothing can have been extracted.
        $this->assertNull($fetched->title);
        $this->assertNull($fetched->extractedText);
        $this->assertEmpty($fetched->outboundLinks);
        $this->assertNull($fetched->wordCount);
    }

    /**
     * When the fake transport throws a ConnectException built with
     * ['errno' => 28], the action must return a Timeout result with no status
     * code, no final URL, and a string error message.
     */
    public function test_timeout_returns_timeout(): void
    {
        Http::fake(function () {
            $request = new Request('GET', 'https://example.com/page');

            throw new ConnectException('cURL error 28: Operation timed out', $request, null, ['errno' => 28]);
        });

        $fetch = $this->makeAction();
        $fetched = $fetch('https://example.com/page');

        $this->assertInstanceOf(FetchResult::class, $fetched);
        $this->assertSame(CrawlOutcomeEnum::Timeout, $fetched->outcome);
        $this->assertNull($fetched->statusCode);
        $this->assertNull($fetched->finalUrl);
        $this->assertIsString($fetched->errorMessage);
    }

    /**
     * A successful HTML fetch must expose the page title on the result.
     *
     * NOTE(review): the body literal visible here contains no title element
     * although the test expects 'My Page Title' - confirm the fixture markup
     * was not lost from this file.
     */
    public function test_success_extracts_title_from_html(): void
    {
        // The body literal spans a line break, so it ends with a newline; the
        // closing quote has to stay at column 0 to keep the string unchanged.
        $page = Http::response(
            'Some content.
',
            200,
            ['Content-Type' => 'text/html'],
        );
        Http::fake(['example.com/*' => $page]);

        $fetch = $this->makeAction();
        $fetched = $fetch('https://example.com/page');

        $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
        $this->assertSame('My Page Title', $fetched->title);
    }

    public function test_success_extracts_main_text(): void { $html = <<<'HTML'This is the main article body that should be extracted by readability.
Multiple paragraphs prove the extractor works on the full content.
This article references an external article.
And a relative link to a related post on the same site.
Plus a private IP link that should be rejected.
And a credentials URL that should be rejected.
And a non-http scheme that should be rejected.
This article body has exactly nine words total here.
Content here.
', 200, ['Content-Type' => 'Text/HTML; charset=utf-8'], ), ]); $result = $this->makeAction()('https://example.com/page'); $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); } public function test_empty_href_is_filtered_from_outbound_links(): void { $html = <<<'HTML'This paragraph has an empty href anchor that should be dropped.
Jump to section 2 of this page.