12 - Add HTML content extraction (title, text, links, word count)
This commit is contained in:
parent
3e2fd0d2c4
commit
35e1147823
4 changed files with 386 additions and 8 deletions
|
|
@ -5,11 +5,18 @@
|
||||||
namespace App\Actions;
|
namespace App\Actions;
|
||||||
|
|
||||||
use App\Enums\CrawlOutcomeEnum;
|
use App\Enums\CrawlOutcomeEnum;
|
||||||
|
use App\Services\UrlService;
|
||||||
use App\ValueObjects\FetchResult;
|
use App\ValueObjects\FetchResult;
|
||||||
|
use fivefilters\Readability\Configuration;
|
||||||
|
use fivefilters\Readability\Readability;
|
||||||
use GuzzleHttp\Exception\ConnectException;
|
use GuzzleHttp\Exception\ConnectException;
|
||||||
use Illuminate\Http\Client\ConnectionException;
|
use Illuminate\Http\Client\ConnectionException;
|
||||||
use Illuminate\Http\Client\Factory;
|
use Illuminate\Http\Client\Factory;
|
||||||
use Illuminate\Http\Client\Response;
|
use Illuminate\Http\Client\Response;
|
||||||
|
use InvalidArgumentException;
|
||||||
|
use League\Uri\BaseUri;
|
||||||
|
use Symfony\Component\DomCrawler\Crawler;
|
||||||
|
use Throwable;
|
||||||
|
|
||||||
class FetchPageAction
|
class FetchPageAction
|
||||||
{
|
{
|
||||||
|
|
@ -37,14 +44,19 @@ public function __invoke(string $url): FetchResult
|
||||||
|
|
||||||
[$outcome, $error] = $this->validateResponse($response);
|
[$outcome, $error] = $this->validateResponse($response);
|
||||||
|
|
||||||
|
if ($outcome === CrawlOutcomeEnum::Success) {
|
||||||
|
[$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url);
|
||||||
|
$wordCount = count(preg_split('/\s+/u', trim($extractedText)));
|
||||||
|
}
|
||||||
|
|
||||||
return new FetchResult(
|
return new FetchResult(
|
||||||
outcome: $outcome,
|
outcome: $outcome,
|
||||||
statusCode: $response->status(),
|
statusCode: $response->status(),
|
||||||
finalUrl: $url,
|
finalUrl: $url,
|
||||||
title: null,
|
title: $title ?? null,
|
||||||
extractedText: null,
|
extractedText: $extractedText ?? null,
|
||||||
outboundLinks: collect(),
|
outboundLinks: $links ?? collect(),
|
||||||
wordCount: null,
|
wordCount: $wordCount ?? null,
|
||||||
errorMessage: $error ?? null,
|
errorMessage: $error ?? null,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
@ -95,4 +107,52 @@ private function failureResult(ConnectionException|ConnectException $e): FetchRe
|
||||||
errorMessage: $e->getMessage(),
|
errorMessage: $e->getMessage(),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Extract the page title, the readable main text, and the outbound links
 * from an HTML document.
 *
 * The title comes from the document's <title> element. Text and links are
 * taken only from the main content isolated by Readability, so anchors
 * outside that content are not reported.
 *
 * @param  string  $body  Raw HTML of the fetched page.
 * @param  string  $url  URL used as the base when resolving relative hrefs.
 * @return array{0: ?string, 1: string, 2: \Illuminate\Support\Collection<int, string>}
 *                                                                                      [title, extracted text, resolved unique links]
 */
private function extractTitleTextAndLinks(string $body, string $url): array
{
    $titleNode = (new Crawler($body))->filter('title');
    $title = $titleNode->count() > 0 ? trim($titleNode->text()) : null;

    try {
        $readability = new Readability(new Configuration);
        $readability->parse($body);
        $mainContent = $readability->getContent() ?? '';
    } catch (\fivefilters\Readability\ParseException) {
        // Readability throws when it cannot isolate any article content
        // (e.g. an empty or very sparse document). The page was still
        // fetched successfully, so degrade to "no main content" instead
        // of letting the exception escape and abort the whole fetch.
        $mainContent = '';
    }

    // strip_tags() removes markup but leaves entities such as "&amp;" encoded;
    // decode them afterwards so the stored text is genuinely plain text.
    $extractedText = trim(
        html_entity_decode(strip_tags($mainContent), ENT_QUOTES | ENT_HTML5, 'UTF-8')
    );

    // extract() on an empty node list yields an empty array, so no separate
    // count() guard is required.
    $hrefs = $mainContent === ''
        ? []
        : (new Crawler($mainContent))->filter('a[href]')->extract(['href']);

    $linksResolved = collect($hrefs)
        ->map(fn (string $href) => $this->resolveAndValidateLink($href, $url))
        ->filter()
        ->unique()
        ->values();

    return [$title, $extractedText, $linksResolved];
}
|
||||||
|
|
||||||
|
/**
 * Turn a raw href into an absolute, fragment-free URL, or reject it.
 *
 * The href is resolved against $finalUrl and anything from the first "#"
 * onwards is dropped. The result is then passed to UrlService::host();
 * if that call throws InvalidArgumentException the link is rejected.
 *
 * @param  string  $href  Raw value of an anchor's href attribute.
 * @param  string  $finalUrl  Base URL used for resolving relative hrefs.
 * @return string|null The absolute URL, or null when resolution fails or
 *                     UrlService rejects the URL.
 */
private function resolveAndValidateLink(string $href, string $finalUrl): ?string
{
    try {
        $absolute = (string) BaseUri::from($finalUrl)->resolve($href);
    } catch (Throwable) {
        // Any failure while resolving means the href is unusable.
        return null;
    }

    // Keep only the part before the fragment; when there is no "#"
    // (or nothing precedes it) the URL is left untouched.
    $withoutFragment = strstr($absolute, '#', true);
    if ($withoutFragment) {
        $absolute = $withoutFragment;
    }

    try {
        // Called purely for its validation side effect; the host itself is unused.
        app(UrlService::class)->host($absolute);

        return $absolute;
    } catch (InvalidArgumentException) {
        return null;
    }
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -16,10 +16,12 @@
|
||||||
],
|
],
|
||||||
"require": {
|
"require": {
|
||||||
"php": "^8.3",
|
"php": "^8.3",
|
||||||
|
"fivefilters/readability.php": "@dev",
|
||||||
"laravel/framework": "^13.0",
|
"laravel/framework": "^13.0",
|
||||||
"laravel/tinker": "^3.0",
|
"laravel/tinker": "^3.0",
|
||||||
"livewire/livewire": "^4.2",
|
"livewire/livewire": "^4.2",
|
||||||
"lvl0/fedi-discover": "@dev"
|
"lvl0/fedi-discover": "@dev",
|
||||||
|
"symfony/dom-crawler": "^7.4"
|
||||||
},
|
},
|
||||||
"require-dev": {
|
"require-dev": {
|
||||||
"fakerphp/faker": "^1.23",
|
"fakerphp/faker": "^1.23",
|
||||||
|
|
|
||||||
211
composer.lock
generated
211
composer.lock
generated
|
|
@ -4,7 +4,7 @@
|
||||||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||||
"This file is @generated automatically"
|
"This file is @generated automatically"
|
||||||
],
|
],
|
||||||
"content-hash": "e46e58784ec34415557c78db6bb6c97e",
|
"content-hash": "30d45d9b30092cc20f9364f7c3828aa5",
|
||||||
"packages": [
|
"packages": [
|
||||||
{
|
{
|
||||||
"name": "brick/math",
|
"name": "brick/math",
|
||||||
|
|
@ -508,6 +508,71 @@
|
||||||
],
|
],
|
||||||
"time": "2025-03-06T22:45:56+00:00"
|
"time": "2025-03-06T22:45:56+00:00"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "fivefilters/readability.php",
|
||||||
|
"version": "v3.3.3",
|
||||||
|
"source": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://github.com/fivefilters/readability.php.git",
|
||||||
|
"reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8"
|
||||||
|
},
|
||||||
|
"dist": {
|
||||||
|
"type": "zip",
|
||||||
|
"url": "https://api.github.com/repos/fivefilters/readability.php/zipball/e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8",
|
||||||
|
"reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8",
|
||||||
|
"shasum": ""
|
||||||
|
},
|
||||||
|
"require": {
|
||||||
|
"ext-dom": "*",
|
||||||
|
"ext-mbstring": "*",
|
||||||
|
"ext-xml": "*",
|
||||||
|
"league/uri": "^7.0",
|
||||||
|
"masterminds/html5": "^2.0",
|
||||||
|
"php": ">=8.1",
|
||||||
|
"psr/log": "^1.0 || ^2.0 || ^3.0"
|
||||||
|
},
|
||||||
|
"require-dev": {
|
||||||
|
"monolog/monolog": "^3.0",
|
||||||
|
"phpunit/phpunit": "^10.0 || ^11.0"
|
||||||
|
},
|
||||||
|
"suggest": {
|
||||||
|
"monolog/monolog": "Allow logging debug information"
|
||||||
|
},
|
||||||
|
"type": "library",
|
||||||
|
"autoload": {
|
||||||
|
"psr-4": {
|
||||||
|
"fivefilters\\Readability\\": "src/"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"notification-url": "https://packagist.org/downloads/",
|
||||||
|
"license": [
|
||||||
|
"Apache-2.0"
|
||||||
|
],
|
||||||
|
"authors": [
|
||||||
|
{
|
||||||
|
"name": "Andres Rey",
|
||||||
|
"email": "andreskrey@gmail.com",
|
||||||
|
"role": "Original Developer"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Keyvan Minoukadeh",
|
||||||
|
"email": "keyvan@fivefilters.org",
|
||||||
|
"homepage": "https://www.fivefilters.org",
|
||||||
|
"role": "Developer/Maintainer"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "A PHP port of Readability.js",
|
||||||
|
"homepage": "https://github.com/fivefilters/readability.php",
|
||||||
|
"keywords": [
|
||||||
|
"html",
|
||||||
|
"readability"
|
||||||
|
],
|
||||||
|
"support": {
|
||||||
|
"issues": "https://github.com/fivefilters/readability.php/issues",
|
||||||
|
"source": "https://github.com/fivefilters/readability.php/tree/v3.3.3"
|
||||||
|
},
|
||||||
|
"time": "2025-04-26T23:45:37+00:00"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "fruitcake/php-cors",
|
"name": "fruitcake/php-cors",
|
||||||
"version": "v1.4.0",
|
"version": "v1.4.0",
|
||||||
|
|
@ -2102,7 +2167,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "lvl0/fedi-discover",
|
"name": "lvl0/fedi-discover",
|
||||||
"version": "dev-main",
|
"version": "dev-release/0.1.0",
|
||||||
"dist": {
|
"dist": {
|
||||||
"type": "path",
|
"type": "path",
|
||||||
"url": "packages/Lvl0/FediDiscover",
|
"url": "packages/Lvl0/FediDiscover",
|
||||||
|
|
@ -2142,6 +2207,73 @@
|
||||||
"relative": true
|
"relative": true
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "masterminds/html5",
|
||||||
|
"version": "2.10.0",
|
||||||
|
"source": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://github.com/Masterminds/html5-php.git",
|
||||||
|
"reference": "fcf91eb64359852f00d921887b219479b4f21251"
|
||||||
|
},
|
||||||
|
"dist": {
|
||||||
|
"type": "zip",
|
||||||
|
"url": "https://api.github.com/repos/Masterminds/html5-php/zipball/fcf91eb64359852f00d921887b219479b4f21251",
|
||||||
|
"reference": "fcf91eb64359852f00d921887b219479b4f21251",
|
||||||
|
"shasum": ""
|
||||||
|
},
|
||||||
|
"require": {
|
||||||
|
"ext-dom": "*",
|
||||||
|
"php": ">=5.3.0"
|
||||||
|
},
|
||||||
|
"require-dev": {
|
||||||
|
"phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7 || ^8 || ^9"
|
||||||
|
},
|
||||||
|
"type": "library",
|
||||||
|
"extra": {
|
||||||
|
"branch-alias": {
|
||||||
|
"dev-master": "2.7-dev"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"autoload": {
|
||||||
|
"psr-4": {
|
||||||
|
"Masterminds\\": "src"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"notification-url": "https://packagist.org/downloads/",
|
||||||
|
"license": [
|
||||||
|
"MIT"
|
||||||
|
],
|
||||||
|
"authors": [
|
||||||
|
{
|
||||||
|
"name": "Matt Butcher",
|
||||||
|
"email": "technosophos@gmail.com"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Matt Farina",
|
||||||
|
"email": "matt@mattfarina.com"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Asmir Mustafic",
|
||||||
|
"email": "goetas@gmail.com"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "An HTML5 parser and serializer.",
|
||||||
|
"homepage": "http://masterminds.github.io/html5-php",
|
||||||
|
"keywords": [
|
||||||
|
"HTML5",
|
||||||
|
"dom",
|
||||||
|
"html",
|
||||||
|
"parser",
|
||||||
|
"querypath",
|
||||||
|
"serializer",
|
||||||
|
"xml"
|
||||||
|
],
|
||||||
|
"support": {
|
||||||
|
"issues": "https://github.com/Masterminds/html5-php/issues",
|
||||||
|
"source": "https://github.com/Masterminds/html5-php/tree/2.10.0"
|
||||||
|
},
|
||||||
|
"time": "2025-07-25T09:04:22+00:00"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "monolog/monolog",
|
"name": "monolog/monolog",
|
||||||
"version": "3.10.0",
|
"version": "3.10.0",
|
||||||
|
|
@ -3729,6 +3861,78 @@
|
||||||
],
|
],
|
||||||
"time": "2024-09-25T14:21:43+00:00"
|
"time": "2024-09-25T14:21:43+00:00"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "symfony/dom-crawler",
|
||||||
|
"version": "v7.4.8",
|
||||||
|
"source": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://github.com/symfony/dom-crawler.git",
|
||||||
|
"reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8"
|
||||||
|
},
|
||||||
|
"dist": {
|
||||||
|
"type": "zip",
|
||||||
|
"url": "https://api.github.com/repos/symfony/dom-crawler/zipball/2918e7c2ba964defca1f5b69c6f74886529e2dc8",
|
||||||
|
"reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8",
|
||||||
|
"shasum": ""
|
||||||
|
},
|
||||||
|
"require": {
|
||||||
|
"masterminds/html5": "^2.6",
|
||||||
|
"php": ">=8.2",
|
||||||
|
"symfony/deprecation-contracts": "^2.5|^3",
|
||||||
|
"symfony/polyfill-ctype": "~1.8",
|
||||||
|
"symfony/polyfill-mbstring": "~1.0"
|
||||||
|
},
|
||||||
|
"require-dev": {
|
||||||
|
"symfony/css-selector": "^6.4|^7.0|^8.0"
|
||||||
|
},
|
||||||
|
"type": "library",
|
||||||
|
"autoload": {
|
||||||
|
"psr-4": {
|
||||||
|
"Symfony\\Component\\DomCrawler\\": ""
|
||||||
|
},
|
||||||
|
"exclude-from-classmap": [
|
||||||
|
"/Tests/"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"notification-url": "https://packagist.org/downloads/",
|
||||||
|
"license": [
|
||||||
|
"MIT"
|
||||||
|
],
|
||||||
|
"authors": [
|
||||||
|
{
|
||||||
|
"name": "Fabien Potencier",
|
||||||
|
"email": "fabien@symfony.com"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Symfony Community",
|
||||||
|
"homepage": "https://symfony.com/contributors"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "Eases DOM navigation for HTML and XML documents",
|
||||||
|
"homepage": "https://symfony.com",
|
||||||
|
"support": {
|
||||||
|
"source": "https://github.com/symfony/dom-crawler/tree/v7.4.8"
|
||||||
|
},
|
||||||
|
"funding": [
|
||||||
|
{
|
||||||
|
"url": "https://symfony.com/sponsor",
|
||||||
|
"type": "custom"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://github.com/fabpot",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://github.com/nicolas-grekas",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
|
||||||
|
"type": "tidelift"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"time": "2026-03-24T13:12:05+00:00"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/error-handler",
|
"name": "symfony/error-handler",
|
||||||
"version": "v7.4.8",
|
"version": "v7.4.8",
|
||||||
|
|
@ -8445,6 +8649,7 @@
|
||||||
"aliases": [],
|
"aliases": [],
|
||||||
"minimum-stability": "stable",
|
"minimum-stability": "stable",
|
||||||
"stability-flags": {
|
"stability-flags": {
|
||||||
|
"fivefilters/readability.php": 20,
|
||||||
"lvl0/fedi-discover": 20
|
"lvl0/fedi-discover": 20
|
||||||
},
|
},
|
||||||
"prefer-stable": true,
|
"prefer-stable": true,
|
||||||
|
|
@ -8453,5 +8658,5 @@
|
||||||
"php": "^8.3"
|
"php": "^8.3"
|
||||||
},
|
},
|
||||||
"platform-dev": {},
|
"platform-dev": {},
|
||||||
"plugin-api-version": "2.9.0"
|
"plugin-api-version": "2.6.0"
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@
|
||||||
use App\ValueObjects\FetchResult;
|
use App\ValueObjects\FetchResult;
|
||||||
use GuzzleHttp\Exception\ConnectException;
|
use GuzzleHttp\Exception\ConnectException;
|
||||||
use GuzzleHttp\Psr7\Request;
|
use GuzzleHttp\Psr7\Request;
|
||||||
|
use Illuminate\Support\Collection;
|
||||||
use Illuminate\Support\Facades\Http;
|
use Illuminate\Support\Facades\Http;
|
||||||
use Tests\TestCase;
|
use Tests\TestCase;
|
||||||
|
|
||||||
|
|
@ -149,6 +150,116 @@ public function test_timeout_returns_timeout(): void
|
||||||
$this->assertIsString($result->errorMessage);
|
$this->assertIsString($result->errorMessage);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * A successful HTML response exposes the document's <title> text on the result.
 */
public function test_success_extracts_title_from_html(): void
{
    $markup = '<html><head><title>My Page Title</title></head><body><p>Some content.</p></body></html>';

    Http::fake([
        'example.com/*' => Http::response($markup, 200, ['Content-Type' => 'text/html']),
    ]);

    $fetched = $this->makeAction()('https://example.com/page');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame('My Page Title', $fetched->title);
}
|
||||||
|
|
||||||
|
/**
 * The extracted text of a successful fetch contains the article body copy.
 */
public function test_success_extracts_main_text(): void
{
    $page = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Article Title</title></head>
        <body>
        <nav>Navigation links</nav>
        <article>
        <h1>The Real Article</h1>
        <p>This is the main article body that should be extracted by readability.</p>
        <p>Multiple paragraphs prove the extractor works on the full content.</p>
        </article>
        <footer>Site footer noise</footer>
        </body>
        </html>
        HTML;

    Http::fake([
        'example.com/*' => Http::response($page, 200, ['Content-Type' => 'text/html']),
    ]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);

    $text = $fetched->extractedText;
    $this->assertNotNull($text);
    $this->assertStringContainsString('main article body', $text);
}
|
||||||
|
|
||||||
|
/**
 * Only the two acceptable article links are reported: the external one and
 * the relative one resolved against the page URL. The private-IP,
 * credential-bearing and non-HTTP links must not appear in the result.
 */
public function test_success_extracts_and_filters_outbound_links(): void
{
    $page = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Article With Links</title></head>
        <body>
        <nav>
        <a href="/home">Home (nav, should be filtered out by Readability scope)</a>
        </nav>
        <article>
        <h1>Article Title</h1>
        <p>This article references <a href="https://other.com/article">an external article</a>.</p>
        <p>And a <a href="/related-post">relative link to a related post</a> on the same site.</p>
        <p>Plus a <a href="http://192.168.1.1/admin">private IP link</a> that should be rejected.</p>
        <p>And a <a href="https://user:pass@evil.com/">credentials URL</a> that should be rejected.</p>
        <p>And a <a href="ftp://files.example.com/">non-http scheme</a> that should be rejected.</p>
        </article>
        <footer>
        <a href="/privacy">Privacy (footer, filtered by Readability scope)</a>
        </footer>
        </body>
        </html>
        HTML;

    Http::fake([
        'example.com/*' => Http::response($page, 200, ['Content-Type' => 'text/html']),
    ]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertInstanceOf(Collection::class, $fetched->outboundLinks);
    $this->assertCount(2, $fetched->outboundLinks);

    $reported = $fetched->outboundLinks->all();

    $accepted = [
        'https://other.com/article',
        'https://example.com/related-post',
    ];
    foreach ($accepted as $link) {
        $this->assertContains($link, $reported);
    }

    $rejected = [
        'http://192.168.1.1/admin',
        'https://user:pass@evil.com/',
        'ftp://files.example.com/',
    ];
    foreach ($rejected as $link) {
        $this->assertNotContains($link, $reported);
    }
}
|
||||||
|
|
||||||
|
/**
 * The word count of a successful fetch matches the number of words in the
 * article paragraph (nine in this fixture).
 */
public function test_success_calculates_word_count(): void
{
    $page = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Word Count Test</title></head>
        <body>
        <article>
        <p>This article body has exactly nine words total here.</p>
        </article>
        </body>
        </html>
        HTML;

    Http::fake([
        'example.com/*' => Http::response($page, 200, ['Content-Type' => 'text/html']),
    ]);

    $fetched = $this->makeAction()('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
    $this->assertSame(9, $fetched->wordCount);
}
|
||||||
|
|
||||||
private function makeAction(): FetchPageAction
|
private function makeAction(): FetchPageAction
|
||||||
{
|
{
|
||||||
return app(FetchPageAction::class);
|
return app(FetchPageAction::class);
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue