12 - Add HTML content extraction (title, text, links, word count)

This commit is contained in:
myrmidex 2026-04-26 19:35:04 +02:00
parent 3e2fd0d2c4
commit 35e1147823
4 changed files with 386 additions and 8 deletions

View file

@ -5,11 +5,18 @@
namespace App\Actions; namespace App\Actions;
use App\Enums\CrawlOutcomeEnum; use App\Enums\CrawlOutcomeEnum;
use App\Services\UrlService;
use App\ValueObjects\FetchResult; use App\ValueObjects\FetchResult;
use fivefilters\Readability\Configuration;
use fivefilters\Readability\Readability;
use GuzzleHttp\Exception\ConnectException; use GuzzleHttp\Exception\ConnectException;
use Illuminate\Http\Client\ConnectionException; use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\Factory; use Illuminate\Http\Client\Factory;
use Illuminate\Http\Client\Response; use Illuminate\Http\Client\Response;
use InvalidArgumentException;
use League\Uri\BaseUri;
use Symfony\Component\DomCrawler\Crawler;
use Throwable;
class FetchPageAction class FetchPageAction
{ {
@ -37,14 +44,19 @@ public function __invoke(string $url): FetchResult
[$outcome, $error] = $this->validateResponse($response); [$outcome, $error] = $this->validateResponse($response);
if ($outcome === CrawlOutcomeEnum::Success) {
[$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url);
$wordCount = count(preg_split('/\s+/u', trim($extractedText)));
}
return new FetchResult( return new FetchResult(
outcome: $outcome, outcome: $outcome,
statusCode: $response->status(), statusCode: $response->status(),
finalUrl: $url, finalUrl: $url,
title: null, title: $title ?? null,
extractedText: null, extractedText: $extractedText ?? null,
outboundLinks: collect(), outboundLinks: $links ?? collect(),
wordCount: null, wordCount: $wordCount ?? null,
errorMessage: $error ?? null, errorMessage: $error ?? null,
); );
} }
@ -95,4 +107,52 @@ private function failureResult(ConnectionException|ConnectException $e): FetchRe
errorMessage: $e->getMessage(), errorMessage: $e->getMessage(),
); );
} }
/**
 * Extracts the page title, the readable main text and the outbound links from an HTML body.
 *
 * The title is read from the document's <title> element. Readability isolates the
 * main content; both the text and the links are taken from that content only, so
 * markup Readability discards contributes neither text nor links.
 *
 * @param  string  $body  Raw HTML of the fetched page.
 * @param  string  $url   URL used as the base when resolving relative hrefs.
 * @return array{0: ?string, 1: string, 2: \Illuminate\Support\Collection<int, string>}
 *         Title (null when there is no <title>), plain extracted text, and the
 *         de-duplicated list of resolved links that passed validation.
 */
private function extractTitleTextAndLinks(string $body, string $url): array
{
    $crawler = new Crawler($body);

    // Query the <title> node once and reuse it for both the existence check and the text.
    $titleNode = $crawler->filter('title');
    $title = $titleNode->count() > 0 ? trim($titleNode->text()) : null;

    try {
        $readability = new Readability(new Configuration);
        $readability->parse($body);
        $mainContent = $readability->getContent() ?? '';
    } catch (\fivefilters\Readability\ParseException) {
        // Readability throws when the document is empty or has no usable content.
        // That is not a fetch failure: fall back to "no main content" so the
        // caller still gets the title instead of an uncaught exception.
        $mainContent = '';
    }

    // getContent() returns HTML, so after removing tags the text still holds
    // entities such as "&amp;". Decode them to get real plain text.
    $extractedText = trim(
        html_entity_decode(strip_tags($mainContent), ENT_QUOTES | ENT_HTML5, 'UTF-8')
    );

    $links = collect();

    if ($mainContent !== '') {
        // extract() returns an empty array when nothing matches, so no count check is needed.
        $links = collect((new Crawler($mainContent))->filter('a[href]')->extract(['href']));
    }

    $linksResolved = $links
        ->map(fn (string $href) => $this->resolveAndValidateLink($href, $url))
        ->filter()   // drop hrefs that could not be resolved or were rejected
        ->unique()
        ->values();

    return [$title, $extractedText, $linksResolved];
}
/**
 * Turns a raw href into an absolute URL without its fragment and checks it with UrlService.
 *
 * @param  string  $href      The href attribute value as found in the page.
 * @param  string  $finalUrl  Base URL the href is resolved against.
 * @return string|null  The resolved URL, or null when resolution fails or
 *                      UrlService::host() rejects it with an InvalidArgumentException.
 */
private function resolveAndValidateLink(string $href, string $finalUrl): ?string
{
    try {
        $absolute = (string) BaseUri::from($finalUrl)->resolve($href);
    } catch (Throwable) {
        // Any failure while resolving means the link is unusable.
        return null;
    }

    // Keep only the part before the first "#"; when there is no fragment
    // (or nothing usable precedes it) the resolved URL is kept unchanged.
    $withoutFragment = strstr($absolute, '#', true);
    $candidate = $withoutFragment ?: $absolute;

    try {
        // host() is used purely as a validator here; its return value is ignored.
        // NOTE(review): the rejection rules live in UrlService, outside this block.
        app(UrlService::class)->host($candidate);
    } catch (InvalidArgumentException) {
        return null;
    }

    return $candidate;
}
} }

View file

@ -16,10 +16,12 @@
], ],
"require": { "require": {
"php": "^8.3", "php": "^8.3",
"fivefilters/readability.php": "@dev",
"laravel/framework": "^13.0", "laravel/framework": "^13.0",
"laravel/tinker": "^3.0", "laravel/tinker": "^3.0",
"livewire/livewire": "^4.2", "livewire/livewire": "^4.2",
"lvl0/fedi-discover": "@dev" "lvl0/fedi-discover": "@dev",
"symfony/dom-crawler": "^7.4"
}, },
"require-dev": { "require-dev": {
"fakerphp/faker": "^1.23", "fakerphp/faker": "^1.23",

211
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically" "This file is @generated automatically"
], ],
"content-hash": "e46e58784ec34415557c78db6bb6c97e", "content-hash": "30d45d9b30092cc20f9364f7c3828aa5",
"packages": [ "packages": [
{ {
"name": "brick/math", "name": "brick/math",
@ -508,6 +508,71 @@
], ],
"time": "2025-03-06T22:45:56+00:00" "time": "2025-03-06T22:45:56+00:00"
}, },
{
"name": "fivefilters/readability.php",
"version": "v3.3.3",
"source": {
"type": "git",
"url": "https://github.com/fivefilters/readability.php.git",
"reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/fivefilters/readability.php/zipball/e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8",
"reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-mbstring": "*",
"ext-xml": "*",
"league/uri": "^7.0",
"masterminds/html5": "^2.0",
"php": ">=8.1",
"psr/log": "^1.0 || ^2.0 || ^3.0"
},
"require-dev": {
"monolog/monolog": "^3.0",
"phpunit/phpunit": "^10.0 || ^11.0"
},
"suggest": {
"monolog/monolog": "Allow logging debug information"
},
"type": "library",
"autoload": {
"psr-4": {
"fivefilters\\Readability\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"Apache-2.0"
],
"authors": [
{
"name": "Andres Rey",
"email": "andreskrey@gmail.com",
"role": "Original Developer"
},
{
"name": "Keyvan Minoukadeh",
"email": "keyvan@fivefilters.org",
"homepage": "https://www.fivefilters.org",
"role": "Developer/Maintainer"
}
],
"description": "A PHP port of Readability.js",
"homepage": "https://github.com/fivefilters/readability.php",
"keywords": [
"html",
"readability"
],
"support": {
"issues": "https://github.com/fivefilters/readability.php/issues",
"source": "https://github.com/fivefilters/readability.php/tree/v3.3.3"
},
"time": "2025-04-26T23:45:37+00:00"
},
{ {
"name": "fruitcake/php-cors", "name": "fruitcake/php-cors",
"version": "v1.4.0", "version": "v1.4.0",
@ -2102,7 +2167,7 @@
}, },
{ {
"name": "lvl0/fedi-discover", "name": "lvl0/fedi-discover",
"version": "dev-main", "version": "dev-release/0.1.0",
"dist": { "dist": {
"type": "path", "type": "path",
"url": "packages/Lvl0/FediDiscover", "url": "packages/Lvl0/FediDiscover",
@ -2142,6 +2207,73 @@
"relative": true "relative": true
} }
}, },
{
"name": "masterminds/html5",
"version": "2.10.0",
"source": {
"type": "git",
"url": "https://github.com/Masterminds/html5-php.git",
"reference": "fcf91eb64359852f00d921887b219479b4f21251"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/Masterminds/html5-php/zipball/fcf91eb64359852f00d921887b219479b4f21251",
"reference": "fcf91eb64359852f00d921887b219479b4f21251",
"shasum": ""
},
"require": {
"ext-dom": "*",
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7 || ^8 || ^9"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "2.7-dev"
}
},
"autoload": {
"psr-4": {
"Masterminds\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Matt Butcher",
"email": "technosophos@gmail.com"
},
{
"name": "Matt Farina",
"email": "matt@mattfarina.com"
},
{
"name": "Asmir Mustafic",
"email": "goetas@gmail.com"
}
],
"description": "An HTML5 parser and serializer.",
"homepage": "http://masterminds.github.io/html5-php",
"keywords": [
"HTML5",
"dom",
"html",
"parser",
"querypath",
"serializer",
"xml"
],
"support": {
"issues": "https://github.com/Masterminds/html5-php/issues",
"source": "https://github.com/Masterminds/html5-php/tree/2.10.0"
},
"time": "2025-07-25T09:04:22+00:00"
},
{ {
"name": "monolog/monolog", "name": "monolog/monolog",
"version": "3.10.0", "version": "3.10.0",
@ -3729,6 +3861,78 @@
], ],
"time": "2024-09-25T14:21:43+00:00" "time": "2024-09-25T14:21:43+00:00"
}, },
{
"name": "symfony/dom-crawler",
"version": "v7.4.8",
"source": {
"type": "git",
"url": "https://github.com/symfony/dom-crawler.git",
"reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/dom-crawler/zipball/2918e7c2ba964defca1f5b69c6f74886529e2dc8",
"reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8",
"shasum": ""
},
"require": {
"masterminds/html5": "^2.6",
"php": ">=8.2",
"symfony/deprecation-contracts": "^2.5|^3",
"symfony/polyfill-ctype": "~1.8",
"symfony/polyfill-mbstring": "~1.0"
},
"require-dev": {
"symfony/css-selector": "^6.4|^7.0|^8.0"
},
"type": "library",
"autoload": {
"psr-4": {
"Symfony\\Component\\DomCrawler\\": ""
},
"exclude-from-classmap": [
"/Tests/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Fabien Potencier",
"email": "fabien@symfony.com"
},
{
"name": "Symfony Community",
"homepage": "https://symfony.com/contributors"
}
],
"description": "Eases DOM navigation for HTML and XML documents",
"homepage": "https://symfony.com",
"support": {
"source": "https://github.com/symfony/dom-crawler/tree/v7.4.8"
},
"funding": [
{
"url": "https://symfony.com/sponsor",
"type": "custom"
},
{
"url": "https://github.com/fabpot",
"type": "github"
},
{
"url": "https://github.com/nicolas-grekas",
"type": "github"
},
{
"url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
"type": "tidelift"
}
],
"time": "2026-03-24T13:12:05+00:00"
},
{ {
"name": "symfony/error-handler", "name": "symfony/error-handler",
"version": "v7.4.8", "version": "v7.4.8",
@ -8445,6 +8649,7 @@
"aliases": [], "aliases": [],
"minimum-stability": "stable", "minimum-stability": "stable",
"stability-flags": { "stability-flags": {
"fivefilters/readability.php": 20,
"lvl0/fedi-discover": 20 "lvl0/fedi-discover": 20
}, },
"prefer-stable": true, "prefer-stable": true,
@ -8453,5 +8658,5 @@
"php": "^8.3" "php": "^8.3"
}, },
"platform-dev": {}, "platform-dev": {},
"plugin-api-version": "2.9.0" "plugin-api-version": "2.6.0"
} }

View file

@ -9,6 +9,7 @@
use App\ValueObjects\FetchResult; use App\ValueObjects\FetchResult;
use GuzzleHttp\Exception\ConnectException; use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Psr7\Request; use GuzzleHttp\Psr7\Request;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Http; use Illuminate\Support\Facades\Http;
use Tests\TestCase; use Tests\TestCase;
@ -149,6 +150,116 @@ public function test_timeout_returns_timeout(): void
$this->assertIsString($result->errorMessage); $this->assertIsString($result->errorMessage);
} }
/**
 * A successful HTML response exposes the page's <title> text on the result.
 */
public function test_success_extracts_title_from_html(): void
{
    $fakeResponse = Http::response(
        '<html><head><title>My Page Title</title></head><body><p>Some content.</p></body></html>',
        200,
        ['Content-Type' => 'text/html'],
    );
    Http::fake(['example.com/*' => $fakeResponse]);

    $action = $this->makeAction();
    $result = $action('https://example.com/page');

    $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
    $this->assertSame('My Page Title', $result->title);
}
/**
 * The article body of a successful HTML response ends up in the extracted text.
 */
public function test_success_extracts_main_text(): void
{
    // The heredoc body is indented together with its closing marker, so PHP
    // strips that indentation and the fixture string itself is unchanged.
    $page = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Article Title</title></head>
        <body>
        <nav>Navigation links</nav>
        <article>
        <h1>The Real Article</h1>
        <p>This is the main article body that should be extracted by readability.</p>
        <p>Multiple paragraphs prove the extractor works on the full content.</p>
        </article>
        <footer>Site footer noise</footer>
        </body>
        </html>
        HTML;

    $fakeResponse = Http::response($page, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $fakeResponse]);

    $action = $this->makeAction();
    $result = $action('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
    $this->assertNotNull($result->extractedText);
    $this->assertStringContainsString('main article body', $result->extractedText);
}
/**
 * Only the two acceptable article links are returned: the external one as-is and
 * the relative one resolved against the page URL. The private-IP, credentialed
 * and ftp links must be absent, and the count of two implies the nav and footer
 * links are excluded as well.
 */
public function test_success_extracts_and_filters_outbound_links(): void
{
    // The heredoc body is indented together with its closing marker, so PHP
    // strips that indentation and the fixture string itself is unchanged.
    $page = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Article With Links</title></head>
        <body>
        <nav>
        <a href="/home">Home (nav, should be filtered out by Readability scope)</a>
        </nav>
        <article>
        <h1>Article Title</h1>
        <p>This article references <a href="https://other.com/article">an external article</a>.</p>
        <p>And a <a href="/related-post">relative link to a related post</a> on the same site.</p>
        <p>Plus a <a href="http://192.168.1.1/admin">private IP link</a> that should be rejected.</p>
        <p>And a <a href="https://user:pass@evil.com/">credentials URL</a> that should be rejected.</p>
        <p>And a <a href="ftp://files.example.com/">non-http scheme</a> that should be rejected.</p>
        </article>
        <footer>
        <a href="/privacy">Privacy (footer, filtered by Readability scope)</a>
        </footer>
        </body>
        </html>
        HTML;

    $fakeResponse = Http::response($page, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $fakeResponse]);

    $action = $this->makeAction();
    $result = $action('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
    $this->assertInstanceOf(Collection::class, $result->outboundLinks);
    $this->assertCount(2, $result->outboundLinks);

    $found = $result->outboundLinks->all();

    // Links that must survive extraction and validation.
    $this->assertContains('https://other.com/article', $found);
    $this->assertContains('https://example.com/related-post', $found);

    // Links that must have been rejected.
    $this->assertNotContains('http://192.168.1.1/admin', $found);
    $this->assertNotContains('https://user:pass@evil.com/', $found);
    $this->assertNotContains('ftp://files.example.com/', $found);
}
/**
 * The word count on the result reflects the nine words of the article paragraph.
 */
public function test_success_calculates_word_count(): void
{
    // The heredoc body is indented together with its closing marker, so PHP
    // strips that indentation and the fixture string itself is unchanged.
    $page = <<<'HTML'
        <!DOCTYPE html>
        <html>
        <head><title>Word Count Test</title></head>
        <body>
        <article>
        <p>This article body has exactly nine words total here.</p>
        </article>
        </body>
        </html>
        HTML;

    $fakeResponse = Http::response($page, 200, ['Content-Type' => 'text/html']);
    Http::fake(['example.com/*' => $fakeResponse]);

    $action = $this->makeAction();
    $result = $action('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
    $this->assertSame(9, $result->wordCount);
}
private function makeAction(): FetchPageAction private function makeAction(): FetchPageAction
{ {
return app(FetchPageAction::class); return app(FetchPageAction::class);