diff --git a/app/Actions/FetchPageAction.php b/app/Actions/FetchPageAction.php index 9a69018..7b98142 100644 --- a/app/Actions/FetchPageAction.php +++ b/app/Actions/FetchPageAction.php @@ -5,11 +5,18 @@ namespace App\Actions; use App\Enums\CrawlOutcomeEnum; +use App\Services\UrlService; use App\ValueObjects\FetchResult; +use fivefilters\Readability\Configuration; +use fivefilters\Readability\Readability; use GuzzleHttp\Exception\ConnectException; use Illuminate\Http\Client\ConnectionException; use Illuminate\Http\Client\Factory; use Illuminate\Http\Client\Response; +use InvalidArgumentException; +use League\Uri\BaseUri; +use Symfony\Component\DomCrawler\Crawler; +use Throwable; class FetchPageAction { @@ -37,14 +44,19 @@ public function __invoke(string $url): FetchResult [$outcome, $error] = $this->validateResponse($response); + if ($outcome === CrawlOutcomeEnum::Success) { + [$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url); + $wordCount = count(preg_split('/\s+/u', trim($extractedText))); + } + return new FetchResult( outcome: $outcome, statusCode: $response->status(), finalUrl: $url, - title: null, - extractedText: null, - outboundLinks: collect(), - wordCount: null, + title: $title ?? null, + extractedText: $extractedText ?? null, + outboundLinks: $links ?? collect(), + wordCount: $wordCount ?? null, errorMessage: $error ?? null, ); } @@ -95,4 +107,52 @@ private function failureResult(ConnectionException|ConnectException $e): FetchRe errorMessage: $e->getMessage(), ); } + + private function extractTitleTextAndLinks(string $body, string $url): array + { + $crawler = new Crawler($body); + + $title = $crawler->filter('title')->count() > 0 + ? trim($crawler->filter('title')->text()) + : null; + + $readability = new Readability(new Configuration); + $readability->parse($body); + $mainContent = $readability->getContent() ?? ''; + $extractedText = trim(strip_tags($mainContent)); + + $links = collect(); + if ($mainContent !== '') { + $linkCrawler = new Crawler($mainContent); + if ($linkCrawler->filter('a[href]')->count() > 0) { + $links = collect($linkCrawler->filter('a[href]')->extract(['href'])); + } + } + + $linksResolved = $links + ->map(fn (string $href) => $this->resolveAndValidateLink($href, $url)) + ->filter() + ->unique() + ->values(); + + return [$title, $extractedText, $linksResolved]; + } + + private function resolveAndValidateLink(string $href, string $finalUrl): ?string + { + try { + $resolved = (string) BaseUri::from($finalUrl)->resolve($href); + $resolved = strstr($resolved, '#', true) ?: $resolved; + } catch (Throwable) { + return null; + } + + try { + app(UrlService::class)->host($resolved); + } catch (InvalidArgumentException) { + return null; + } + + return $resolved; + } } diff --git a/composer.json b/composer.json index dcb3aca..8494562 100644 --- a/composer.json +++ b/composer.json @@ -16,10 +16,12 @@ ], "require": { "php": "^8.3", + "fivefilters/readability.php": "@dev", "laravel/framework": "^13.0", "laravel/tinker": "^3.0", "livewire/livewire": "^4.2", - "lvl0/fedi-discover": "@dev" + "lvl0/fedi-discover": "@dev", + "symfony/dom-crawler": "^7.4" }, "require-dev": { "fakerphp/faker": "^1.23", diff --git a/composer.lock b/composer.lock index 15b7993..06c83c4 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "e46e58784ec34415557c78db6bb6c97e", + "content-hash": "30d45d9b30092cc20f9364f7c3828aa5", "packages": [ { "name": "brick/math", @@ -508,6 +508,71 @@ ], "time": "2025-03-06T22:45:56+00:00" }, + { + "name": "fivefilters/readability.php", + "version": "v3.3.3", + "source": { + "type": "git", + "url": "https://github.com/fivefilters/readability.php.git", + "reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/fivefilters/readability.php/zipball/e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8", + "reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-mbstring": "*", + "ext-xml": "*", + "league/uri": "^7.0", + "masterminds/html5": "^2.0", + "php": ">=8.1", + "psr/log": "^1.0 || ^2.0 || ^3.0" + }, + "require-dev": { + "monolog/monolog": "^3.0", + "phpunit/phpunit": "^10.0 || ^11.0" + }, + "suggest": { + "monolog/monolog": "Allow logging debug information" + }, + "type": "library", + "autoload": { + "psr-4": { + "fivefilters\\Readability\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "Apache-2.0" + ], + "authors": [ + { + "name": "Andres Rey", + "email": "andreskrey@gmail.com", + "role": "Original Developer" + }, + { + "name": "Keyvan Minoukadeh", + "email": "keyvan@fivefilters.org", + "homepage": "https://www.fivefilters.org", + "role": "Developer/Maintainer" + } + ], + "description": "A PHP port of Readability.js", + "homepage": "https://github.com/fivefilters/readability.php", + "keywords": [ + "html", + "readability" + ], + "support": { + "issues": "https://github.com/fivefilters/readability.php/issues", + "source": "https://github.com/fivefilters/readability.php/tree/v3.3.3" + }, + "time": "2025-04-26T23:45:37+00:00" + }, { "name": "fruitcake/php-cors", "version": "v1.4.0", @@ -2102,7 +2167,7 @@ }, { "name": "lvl0/fedi-discover", - "version": "dev-main", + "version": "dev-release/0.1.0", "dist": { "type": "path", "url": "packages/Lvl0/FediDiscover", @@ -2142,6 +2207,73 @@ "relative": true } }, + { + "name": "masterminds/html5", + "version": "2.10.0", + "source": { + "type": "git", + "url": "https://github.com/Masterminds/html5-php.git", + "reference": "fcf91eb64359852f00d921887b219479b4f21251" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Masterminds/html5-php/zipball/fcf91eb64359852f00d921887b219479b4f21251", + "reference": "fcf91eb64359852f00d921887b219479b4f21251", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "php": ">=5.3.0" + }, + "require-dev": { + "phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7 || ^8 || ^9" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.7-dev" + } + }, + "autoload": { + "psr-4": { + "Masterminds\\": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Matt Butcher", + "email": "technosophos@gmail.com" + }, + { + "name": "Matt Farina", + "email": "matt@mattfarina.com" + }, + { + "name": "Asmir Mustafic", + "email": "goetas@gmail.com" + } + ], + "description": "An HTML5 parser and serializer.", + "homepage": "http://masterminds.github.io/html5-php", + "keywords": [ + "HTML5", + "dom", + "html", + "parser", + "querypath", + "serializer", + "xml" + ], + "support": { + "issues": "https://github.com/Masterminds/html5-php/issues", + "source": "https://github.com/Masterminds/html5-php/tree/2.10.0" + }, + "time": "2025-07-25T09:04:22+00:00" + }, { "name": "monolog/monolog", "version": "3.10.0", @@ -3729,6 +3861,78 @@ ], "time": "2024-09-25T14:21:43+00:00" }, + { + "name": "symfony/dom-crawler", + "version": "v7.4.8", + "source": { + "type": "git", + "url": "https://github.com/symfony/dom-crawler.git", + "reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/dom-crawler/zipball/2918e7c2ba964defca1f5b69c6f74886529e2dc8", + "reference": "2918e7c2ba964defca1f5b69c6f74886529e2dc8", + "shasum": "" + }, + "require": { + "masterminds/html5": "^2.6", + "php": ">=8.2", + "symfony/deprecation-contracts": "^2.5|^3", + "symfony/polyfill-ctype": "~1.8", + "symfony/polyfill-mbstring": "~1.0" + }, + "require-dev": { + "symfony/css-selector": "^6.4|^7.0|^8.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "Symfony\\Component\\DomCrawler\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Eases DOM navigation for HTML and XML documents", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/dom-crawler/tree/v7.4.8" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://github.com/nicolas-grekas", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2026-03-24T13:12:05+00:00" + }, { "name": "symfony/error-handler", "version": "v7.4.8", @@ -8445,6 +8649,7 @@ "aliases": [], "minimum-stability": "stable", "stability-flags": { + "fivefilters/readability.php": 20, "lvl0/fedi-discover": 20 }, "prefer-stable": true, @@ -8453,5 +8658,5 @@ "php": "^8.3" }, "platform-dev": {}, - "plugin-api-version": "2.9.0" + "plugin-api-version": "2.6.0" } diff --git a/tests/Feature/Actions/FetchPageActionTest.php b/tests/Feature/Actions/FetchPageActionTest.php index 1b399a8..6925e96 100644 --- a/tests/Feature/Actions/FetchPageActionTest.php +++ b/tests/Feature/Actions/FetchPageActionTest.php @@ -9,6 +9,7 @@ use App\ValueObjects\FetchResult; use GuzzleHttp\Exception\ConnectException; use GuzzleHttp\Psr7\Request; +use Illuminate\Support\Collection; use Illuminate\Support\Facades\Http; use Tests\TestCase; @@ -149,6 +150,116 @@ public function test_timeout_returns_timeout(): void $this->assertIsString($result->errorMessage); } + public function test_success_extracts_title_from_html(): void + { + Http::fake([ + 'example.com/*' => Http::response( + 'My Page Title

Some content.

', + 200, + ['Content-Type' => 'text/html'], + ), + ]); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame('My Page Title', $result->title); + } + + public function test_success_extracts_main_text(): void + { + $html = <<<'HTML' + + + Article Title + + +
+

The Real Article

+

This is the main article body that should be extracted by readability.

+

Multiple paragraphs prove the extractor works on the full content.

+
+ + + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertNotNull($result->extractedText); + $this->assertStringContainsString('main article body', $result->extractedText); + } + + public function test_success_extracts_and_filters_outbound_links(): void + { + $html = <<<'HTML' + + + Article With Links + + +
+

Article Title

+

This article references an external article.

+

And a relative link to a related post on the same site.

+

Plus a private IP link that should be rejected.

+

And a credentials URL that should be rejected.

+

And a non-http scheme that should be rejected.

+
+ + + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertInstanceOf(Collection::class, $result->outboundLinks); + $this->assertSame(2, $result->outboundLinks->count()); + $this->assertContains('https://other.com/article', $result->outboundLinks->all()); + $this->assertContains('https://example.com/related-post', $result->outboundLinks->all()); + $this->assertNotContains('http://192.168.1.1/admin', $result->outboundLinks->all()); + $this->assertNotContains('https://user:pass@evil.com/', $result->outboundLinks->all()); + $this->assertNotContains('ftp://files.example.com/', $result->outboundLinks->all()); + } + + public function test_success_calculates_word_count(): void + { + $html = <<<'HTML' + + + Word Count Test + +
+

This article body has exactly nine words total here.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(9, $result->wordCount); + } + private function makeAction(): FetchPageAction { return app(FetchPageAction::class);