diff --git a/app/Actions/FetchPageAction.php b/app/Actions/FetchPageAction.php index 7b98142..ec92a8d 100644 --- a/app/Actions/FetchPageAction.php +++ b/app/Actions/FetchPageAction.php @@ -22,6 +22,7 @@ class FetchPageAction { public function __construct( private Factory $http, + private UrlService $urlService, ) {} public function __invoke(string $url): FetchResult @@ -46,7 +47,7 @@ public function __invoke(string $url): FetchResult if ($outcome === CrawlOutcomeEnum::Success) { [$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url); - $wordCount = count(preg_split('/\s+/u', trim($extractedText))); + $wordCount = $extractedText !== '' ? count(preg_split('/\s+/u', trim($extractedText))) : 0; } return new FetchResult( @@ -64,18 +65,17 @@ public function __invoke(string $url): FetchResult private function validateResponse(Response $response): array { $status = $response->status(); - $statusStart = substr((string) $status, 0, 1); - if ($statusStart === '4') { + if ($status >= 400 && $status < 500) { return [CrawlOutcomeEnum::Blocked4xx, "HTTP {$status}"]; } - if (str_starts_with((string) $status, '5')) { + if ($status >= 500) { return [CrawlOutcomeEnum::Blocked5xx, "HTTP {$status}"]; } $contentType = $response->header('Content-Type'); - if (! str_starts_with($contentType, 'text/html')) { + if (! str_starts_with(mb_strtolower($contentType), 'text/html')) { return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"]; } @@ -147,8 +147,12 @@ private function resolveAndValidateLink(string $href, string $finalUrl): ?string return null; } + if ($resolved === $finalUrl) { + return null; + } + try { - app(UrlService::class)->host($resolved); + $this->urlService->host($resolved); } catch (InvalidArgumentException) { return null; } diff --git a/app/Enums/CrawlOutcomeEnum.php b/app/Enums/CrawlOutcomeEnum.php index c4e912e..949cf69 100644 --- a/app/Enums/CrawlOutcomeEnum.php +++ b/app/Enums/CrawlOutcomeEnum.php @@ -12,5 +12,12 @@ enum CrawlOutcomeEnum: string case BlockedRobots = 'blocked_robots'; case Blocked4xx = 'blocked_4xx'; case Blocked5xx = 'blocked_5xx'; + + /** + * The HTTP fetch succeeded (2xx) but the response is unindexable in v0.1 + * (non-HTML Content-Type). Worker MUST also write `pages.status = Rejected` + * on this outcome — do NOT treat as Failed. Page row STAYS in the DB to + * prevent re-discovery loops as fediverse re-shares the URL. + */ case Rejected = 'rejected'; } diff --git a/app/Enums/PageStatusEnum.php b/app/Enums/PageStatusEnum.php index ed3abe0..84bee4c 100644 --- a/app/Enums/PageStatusEnum.php +++ b/app/Enums/PageStatusEnum.php @@ -9,5 +9,12 @@ enum PageStatusEnum: string case Discovered = 'discovered'; case Fetched = 'fetched'; case Failed = 'failed'; + + /** + * The crawler fetched the page but rejected it as unindexable in v0.1 + * (non-HTML Content-Type). Page row stays as a sentinel preventing + * re-discovery loops; future re-crawl could flip status back to + * Discovered → Fetched if the URL starts serving HTML. + */ case Rejected = 'rejected'; } diff --git a/app/ValueObjects/FetchResult.php b/app/ValueObjects/FetchResult.php index 86e04bb..d79cdae 100644 --- a/app/ValueObjects/FetchResult.php +++ b/app/ValueObjects/FetchResult.php @@ -7,9 +7,10 @@ use App\Enums\CrawlOutcomeEnum; use Illuminate\Support\Collection; -class FetchResult +final readonly class FetchResult { /** + * @param ?string $finalUrl Set to the request URL in v0.1; true post-redirect URL tracking is deferred (see ticket #12 spec). Downstream consumers MUST NOT trust this field as the post-redirect location until that lands. * @param Collection $outboundLinks */ public function __construct( diff --git a/composer.json b/composer.json index 8494562..de1ad17 100644 --- a/composer.json +++ b/composer.json @@ -16,7 +16,7 @@ ], "require": { "php": "^8.3", - "fivefilters/readability.php": "@dev", + "fivefilters/readability.php": "^3.3", "laravel/framework": "^13.0", "laravel/tinker": "^3.0", "livewire/livewire": "^4.2", diff --git a/composer.lock b/composer.lock index 06c83c4..e1fe116 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "30d45d9b30092cc20f9364f7c3828aa5", + "content-hash": "2c63ed546b17b144997244f805e8a94a", "packages": [ { "name": "brick/math", @@ -4620,7 +4620,7 @@ }, { "name": "symfony/polyfill-ctype", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-ctype.git", @@ -4679,7 +4679,7 @@ "portable" ], "support": { - "source": "https://github.com/symfony/polyfill-ctype/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-ctype/tree/v1.37.0" }, "funding": [ { @@ -4703,16 +4703,16 @@ }, { "name": "symfony/polyfill-intl-grapheme", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-grapheme.git", - "reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df" + "reference": "4864388bfbd3001ce88e234fab652acd91fdc57e" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/ad1b7b9092976d6c948b8a187cec9faaea9ec1df", - "reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df", + "url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/4864388bfbd3001ce88e234fab652acd91fdc57e", + "reference": "4864388bfbd3001ce88e234fab652acd91fdc57e", "shasum": "" }, "require": { @@ -4761,7 +4761,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.37.0" }, "funding": [ { @@ -4781,11 +4781,11 @@ "type": "tidelift" } ], - "time": "2026-04-10T16:19:22+00:00" + "time": "2026-04-26T13:13:48+00:00" }, { "name": "symfony/polyfill-intl-idn", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-idn.git", @@ -4848,7 +4848,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.37.0" }, "funding": [ { @@ -4872,7 +4872,7 @@ }, { "name": "symfony/polyfill-intl-normalizer", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-intl-normalizer.git", @@ -4933,7 +4933,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.37.0" }, "funding": [ { @@ -4957,7 +4957,7 @@ }, { "name": "symfony/polyfill-mbstring", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-mbstring.git", @@ -5018,7 +5018,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-mbstring/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-mbstring/tree/v1.37.0" }, "funding": [ { @@ -5042,7 +5042,7 @@ }, { "name": "symfony/polyfill-php80", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php80.git", @@ -5102,7 +5102,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php80/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php80/tree/v1.37.0" }, "funding": [ { @@ -5126,7 +5126,7 @@ }, { "name": "symfony/polyfill-php83", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php83.git", @@ -5182,7 +5182,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php83/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php83/tree/v1.37.0" }, "funding": [ { @@ -5206,7 +5206,7 @@ }, { "name": "symfony/polyfill-php84", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php84.git", @@ -5262,7 +5262,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php84/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php84/tree/v1.37.0" }, "funding": [ { @@ -5286,16 +5286,16 @@ }, { "name": "symfony/polyfill-php85", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php85.git", - "reference": "2c408a6bb0313e6001a83628dc5506100474254e" + "reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/2c408a6bb0313e6001a83628dc5506100474254e", - "reference": "2c408a6bb0313e6001a83628dc5506100474254e", + "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/fcfa4973a9917cef23f2e38774da74a2b7d115ee", + "reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee", "shasum": "" }, "require": { @@ -5342,7 +5342,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php85/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-php85/tree/v1.37.0" }, "funding": [ { @@ -5362,11 +5362,11 @@ "type": "tidelift" } ], - "time": "2026-04-10T16:50:15+00:00" + "time": "2026-04-26T13:10:57+00:00" }, { "name": "symfony/polyfill-uuid", - "version": "v1.36.0", + "version": "v1.37.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-uuid.git", @@ -5425,7 +5425,7 @@ "uuid" ], "support": { - "source": "https://github.com/symfony/polyfill-uuid/tree/v1.36.0" + "source": "https://github.com/symfony/polyfill-uuid/tree/v1.37.0" }, "funding": [ { @@ -6263,16 +6263,16 @@ }, { "name": "voku/portable-ascii", - "version": "2.1.0", + "version": "2.1.1", "source": { "type": "git", "url": "https://github.com/voku/portable-ascii.git", - "reference": "d870a33f0f79d2b4579740b0620200221ee44aeb" + "reference": "8e1051fe39379367aecf014f41744ce7539a856f" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/voku/portable-ascii/zipball/d870a33f0f79d2b4579740b0620200221ee44aeb", - "reference": "d870a33f0f79d2b4579740b0620200221ee44aeb", + "url": "https://api.github.com/repos/voku/portable-ascii/zipball/8e1051fe39379367aecf014f41744ce7539a856f", + "reference": "8e1051fe39379367aecf014f41744ce7539a856f", "shasum": "" }, "require": { @@ -6309,7 +6309,7 @@ ], "support": { "issues": "https://github.com/voku/portable-ascii/issues", - "source": "https://github.com/voku/portable-ascii/tree/2.1.0" + "source": "https://github.com/voku/portable-ascii/tree/2.1.1" }, "funding": [ { @@ -6333,7 +6333,7 @@ "type": "tidelift" } ], - "time": "2026-04-16T23:10:39+00:00" + "time": "2026-04-26T05:33:54+00:00" } ], "packages-dev": [ @@ -8649,7 +8649,6 @@ "aliases": [], "minimum-stability": "stable", "stability-flags": { - "fivefilters/readability.php": 20, "lvl0/fedi-discover": 20 }, "prefer-stable": true, @@ -8658,5 +8657,5 @@ "php": "^8.3" }, "platform-dev": {}, - "plugin-api-version": "2.6.0" + "plugin-api-version": "2.9.0" } diff --git a/tests/Feature/Actions/FetchPageActionTest.php b/tests/Feature/Actions/FetchPageActionTest.php index 6925e96..b5f415a 100644 --- a/tests/Feature/Actions/FetchPageActionTest.php +++ b/tests/Feature/Actions/FetchPageActionTest.php @@ -260,6 +260,69 @@ public function test_success_calculates_word_count(): void $this->assertSame(9, $result->wordCount); } + public function test_uppercase_content_type_is_accepted_as_html(): void + { + Http::fake([ + 'example.com/*' => Http::response( + 'Uppercase CT

Content here.

', + 200, + ['Content-Type' => 'Text/HTML; charset=utf-8'], + ), + ]); + + $result = $this->makeAction()('https://example.com/page'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + } + + public function test_empty_href_is_filtered_from_outbound_links(): void + { + $html = <<<'HTML' + + + Empty Href Test + +
+

This paragraph has an empty href anchor that should be dropped.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(0, $result->outboundLinks->count()); + } + + public function test_fragment_only_href_is_filtered_from_outbound_links(): void + { + $html = <<<'HTML' + + + Fragment Href Test + +
+

Jump to section 2 of this page.

+
+ + + HTML; + + Http::fake([ + 'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']), + ]); + + $result = $this->makeAction()('https://example.com/article'); + + $this->assertSame(CrawlOutcomeEnum::Success, $result->outcome); + $this->assertSame(0, $result->outboundLinks->count()); + } + private function makeAction(): FetchPageAction { return app(FetchPageAction::class);