12 - Apply pr-reviewer follow-ups: validation, link filters, readonly VO, docs

This commit is contained in:
myrmidex 2026-04-26 19:49:08 +02:00
parent 35e1147823
commit dda5b0f770
7 changed files with 126 additions and 45 deletions

View file

@ -22,6 +22,7 @@ class FetchPageAction
{ {
public function __construct( public function __construct(
private Factory $http, private Factory $http,
private UrlService $urlService,
) {} ) {}
public function __invoke(string $url): FetchResult public function __invoke(string $url): FetchResult
@ -46,7 +47,7 @@ public function __invoke(string $url): FetchResult
if ($outcome === CrawlOutcomeEnum::Success) { if ($outcome === CrawlOutcomeEnum::Success) {
[$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url); [$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url);
$wordCount = count(preg_split('/\s+/u', trim($extractedText))); $wordCount = $extractedText !== '' ? count(preg_split('/\s+/u', trim($extractedText))) : 0;
} }
return new FetchResult( return new FetchResult(
@ -64,18 +65,17 @@ public function __invoke(string $url): FetchResult
private function validateResponse(Response $response): array private function validateResponse(Response $response): array
{ {
$status = $response->status(); $status = $response->status();
$statusStart = substr((string) $status, 0, 1);
if ($statusStart === '4') { if ($status >= 400 && $status < 500) {
return [CrawlOutcomeEnum::Blocked4xx, "HTTP {$status}"]; return [CrawlOutcomeEnum::Blocked4xx, "HTTP {$status}"];
} }
if (str_starts_with((string) $status, '5')) { if ($status >= 500) {
return [CrawlOutcomeEnum::Blocked5xx, "HTTP {$status}"]; return [CrawlOutcomeEnum::Blocked5xx, "HTTP {$status}"];
} }
$contentType = $response->header('Content-Type'); $contentType = $response->header('Content-Type');
if (! str_starts_with($contentType, 'text/html')) { if (! str_starts_with(mb_strtolower($contentType), 'text/html')) {
return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"]; return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"];
} }
@ -147,8 +147,12 @@ private function resolveAndValidateLink(string $href, string $finalUrl): ?string
return null; return null;
} }
if ($resolved === $finalUrl) {
return null;
}
try { try {
app(UrlService::class)->host($resolved); $this->urlService->host($resolved);
} catch (InvalidArgumentException) { } catch (InvalidArgumentException) {
return null; return null;
} }

View file

@ -12,5 +12,12 @@ enum CrawlOutcomeEnum: string
case BlockedRobots = 'blocked_robots'; case BlockedRobots = 'blocked_robots';
case Blocked4xx = 'blocked_4xx'; case Blocked4xx = 'blocked_4xx';
case Blocked5xx = 'blocked_5xx'; case Blocked5xx = 'blocked_5xx';
/**
* The HTTP fetch succeeded (2xx) but the response is unindexable in v0.1
* (non-HTML Content-Type). Worker MUST also write `pages.status = Rejected`
* on this outcome — do NOT treat as Failed. Page row STAYS in the DB to
* prevent re-discovery loops as fediverse re-shares the URL.
*/
case Rejected = 'rejected'; case Rejected = 'rejected';
} }

View file

@ -9,5 +9,12 @@ enum PageStatusEnum: string
case Discovered = 'discovered'; case Discovered = 'discovered';
case Fetched = 'fetched'; case Fetched = 'fetched';
case Failed = 'failed'; case Failed = 'failed';
/**
* The crawler fetched the page but rejected it as unindexable in v0.1
* (non-HTML Content-Type). Page row stays as a sentinel preventing
* re-discovery loops; future re-crawl could flip status back to
* Discovered → Fetched if the URL starts serving HTML.
*/
case Rejected = 'rejected'; case Rejected = 'rejected';
} }

View file

@ -7,9 +7,10 @@
use App\Enums\CrawlOutcomeEnum; use App\Enums\CrawlOutcomeEnum;
use Illuminate\Support\Collection; use Illuminate\Support\Collection;
class FetchResult final readonly class FetchResult
{ {
/** /**
* @param ?string $finalUrl Set to the request URL in v0.1; true post-redirect URL tracking is deferred (see ticket #12 spec). Downstream consumers MUST NOT trust this field as the post-redirect location until that lands.
* @param Collection<int, string> $outboundLinks * @param Collection<int, string> $outboundLinks
*/ */
public function __construct( public function __construct(

View file

@ -16,7 +16,7 @@
], ],
"require": { "require": {
"php": "^8.3", "php": "^8.3",
"fivefilters/readability.php": "@dev", "fivefilters/readability.php": "^3.3",
"laravel/framework": "^13.0", "laravel/framework": "^13.0",
"laravel/tinker": "^3.0", "laravel/tinker": "^3.0",
"livewire/livewire": "^4.2", "livewire/livewire": "^4.2",

73
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically" "This file is @generated automatically"
], ],
"content-hash": "30d45d9b30092cc20f9364f7c3828aa5", "content-hash": "2c63ed546b17b144997244f805e8a94a",
"packages": [ "packages": [
{ {
"name": "brick/math", "name": "brick/math",
@ -4620,7 +4620,7 @@
}, },
{ {
"name": "symfony/polyfill-ctype", "name": "symfony/polyfill-ctype",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-ctype.git", "url": "https://github.com/symfony/polyfill-ctype.git",
@ -4679,7 +4679,7 @@
"portable" "portable"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-ctype/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-ctype/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -4703,16 +4703,16 @@
}, },
{ {
"name": "symfony/polyfill-intl-grapheme", "name": "symfony/polyfill-intl-grapheme",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-intl-grapheme.git", "url": "https://github.com/symfony/polyfill-intl-grapheme.git",
"reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df" "reference": "4864388bfbd3001ce88e234fab652acd91fdc57e"
}, },
"dist": { "dist": {
"type": "zip", "type": "zip",
"url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/ad1b7b9092976d6c948b8a187cec9faaea9ec1df", "url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/4864388bfbd3001ce88e234fab652acd91fdc57e",
"reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df", "reference": "4864388bfbd3001ce88e234fab652acd91fdc57e",
"shasum": "" "shasum": ""
}, },
"require": { "require": {
@ -4761,7 +4761,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -4781,11 +4781,11 @@
"type": "tidelift" "type": "tidelift"
} }
], ],
"time": "2026-04-10T16:19:22+00:00" "time": "2026-04-26T13:13:48+00:00"
}, },
{ {
"name": "symfony/polyfill-intl-idn", "name": "symfony/polyfill-intl-idn",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-intl-idn.git", "url": "https://github.com/symfony/polyfill-intl-idn.git",
@ -4848,7 +4848,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -4872,7 +4872,7 @@
}, },
{ {
"name": "symfony/polyfill-intl-normalizer", "name": "symfony/polyfill-intl-normalizer",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-intl-normalizer.git", "url": "https://github.com/symfony/polyfill-intl-normalizer.git",
@ -4933,7 +4933,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -4957,7 +4957,7 @@
}, },
{ {
"name": "symfony/polyfill-mbstring", "name": "symfony/polyfill-mbstring",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-mbstring.git", "url": "https://github.com/symfony/polyfill-mbstring.git",
@ -5018,7 +5018,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-mbstring/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-mbstring/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -5042,7 +5042,7 @@
}, },
{ {
"name": "symfony/polyfill-php80", "name": "symfony/polyfill-php80",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-php80.git", "url": "https://github.com/symfony/polyfill-php80.git",
@ -5102,7 +5102,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-php80/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-php80/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -5126,7 +5126,7 @@
}, },
{ {
"name": "symfony/polyfill-php83", "name": "symfony/polyfill-php83",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-php83.git", "url": "https://github.com/symfony/polyfill-php83.git",
@ -5182,7 +5182,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-php83/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-php83/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -5206,7 +5206,7 @@
}, },
{ {
"name": "symfony/polyfill-php84", "name": "symfony/polyfill-php84",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-php84.git", "url": "https://github.com/symfony/polyfill-php84.git",
@ -5262,7 +5262,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-php84/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-php84/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -5286,16 +5286,16 @@
}, },
{ {
"name": "symfony/polyfill-php85", "name": "symfony/polyfill-php85",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-php85.git", "url": "https://github.com/symfony/polyfill-php85.git",
"reference": "2c408a6bb0313e6001a83628dc5506100474254e" "reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee"
}, },
"dist": { "dist": {
"type": "zip", "type": "zip",
"url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/2c408a6bb0313e6001a83628dc5506100474254e", "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/fcfa4973a9917cef23f2e38774da74a2b7d115ee",
"reference": "2c408a6bb0313e6001a83628dc5506100474254e", "reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee",
"shasum": "" "shasum": ""
}, },
"require": { "require": {
@ -5342,7 +5342,7 @@
"shim" "shim"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-php85/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-php85/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -5362,11 +5362,11 @@
"type": "tidelift" "type": "tidelift"
} }
], ],
"time": "2026-04-10T16:50:15+00:00" "time": "2026-04-26T13:10:57+00:00"
}, },
{ {
"name": "symfony/polyfill-uuid", "name": "symfony/polyfill-uuid",
"version": "v1.36.0", "version": "v1.37.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/symfony/polyfill-uuid.git", "url": "https://github.com/symfony/polyfill-uuid.git",
@ -5425,7 +5425,7 @@
"uuid" "uuid"
], ],
"support": { "support": {
"source": "https://github.com/symfony/polyfill-uuid/tree/v1.36.0" "source": "https://github.com/symfony/polyfill-uuid/tree/v1.37.0"
}, },
"funding": [ "funding": [
{ {
@ -6263,16 +6263,16 @@
}, },
{ {
"name": "voku/portable-ascii", "name": "voku/portable-ascii",
"version": "2.1.0", "version": "2.1.1",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/voku/portable-ascii.git", "url": "https://github.com/voku/portable-ascii.git",
"reference": "d870a33f0f79d2b4579740b0620200221ee44aeb" "reference": "8e1051fe39379367aecf014f41744ce7539a856f"
}, },
"dist": { "dist": {
"type": "zip", "type": "zip",
"url": "https://api.github.com/repos/voku/portable-ascii/zipball/d870a33f0f79d2b4579740b0620200221ee44aeb", "url": "https://api.github.com/repos/voku/portable-ascii/zipball/8e1051fe39379367aecf014f41744ce7539a856f",
"reference": "d870a33f0f79d2b4579740b0620200221ee44aeb", "reference": "8e1051fe39379367aecf014f41744ce7539a856f",
"shasum": "" "shasum": ""
}, },
"require": { "require": {
@ -6309,7 +6309,7 @@
], ],
"support": { "support": {
"issues": "https://github.com/voku/portable-ascii/issues", "issues": "https://github.com/voku/portable-ascii/issues",
"source": "https://github.com/voku/portable-ascii/tree/2.1.0" "source": "https://github.com/voku/portable-ascii/tree/2.1.1"
}, },
"funding": [ "funding": [
{ {
@ -6333,7 +6333,7 @@
"type": "tidelift" "type": "tidelift"
} }
], ],
"time": "2026-04-16T23:10:39+00:00" "time": "2026-04-26T05:33:54+00:00"
} }
], ],
"packages-dev": [ "packages-dev": [
@ -8649,7 +8649,6 @@
"aliases": [], "aliases": [],
"minimum-stability": "stable", "minimum-stability": "stable",
"stability-flags": { "stability-flags": {
"fivefilters/readability.php": 20,
"lvl0/fedi-discover": 20 "lvl0/fedi-discover": 20
}, },
"prefer-stable": true, "prefer-stable": true,
@ -8658,5 +8657,5 @@
"php": "^8.3" "php": "^8.3"
}, },
"platform-dev": {}, "platform-dev": {},
"plugin-api-version": "2.6.0" "plugin-api-version": "2.9.0"
} }

View file

@ -260,6 +260,69 @@ public function test_success_calculates_word_count(): void
$this->assertSame(9, $result->wordCount); $this->assertSame(9, $result->wordCount);
} }
/**
 * A response whose Content-Type media type is not lowercase ("Text/HTML")
 * must still produce a Success outcome.
 */
public function test_uppercase_content_type_is_accepted_as_html(): void
{
    $markup = '<html><head><title>Uppercase CT</title></head><body><p>Content here.</p></body></html>';
    $mixedCaseHeaders = ['Content-Type' => 'Text/HTML; charset=utf-8'];

    // Serve the fixture for every example.com URL.
    Http::fake([
        'example.com/*' => Http::response($markup, 200, $mixedCaseHeaders),
    ]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/page');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);
}
/**
 * An anchor with an empty href ("") must not appear in the outbound links
 * of an otherwise successful fetch.
 */
public function test_empty_href_is_filtered_from_outbound_links(): void
{
    // The closing marker's indentation is stripped from every nowdoc line,
    // so the fixture string carries no leading whitespace.
    $page = <<<'HTML'
    <!DOCTYPE html>
    <html>
    <head><title>Empty Href Test</title></head>
    <body>
    <article>
    <p>This paragraph has <a href="">an empty href anchor</a> that should be dropped.</p>
    </article>
    </body>
    </html>
    HTML;

    $htmlHeaders = ['Content-Type' => 'text/html'];
    Http::fake([
        'example.com/*' => Http::response($page, 200, $htmlHeaders),
    ]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);

    $linkCount = $fetched->outboundLinks->count();
    $this->assertSame(0, $linkCount);
}
/**
 * An anchor whose href is only a fragment ("#section-2") must not appear in
 * the outbound links of an otherwise successful fetch.
 */
public function test_fragment_only_href_is_filtered_from_outbound_links(): void
{
    // The closing marker's indentation is stripped from every nowdoc line,
    // so the fixture string carries no leading whitespace.
    $page = <<<'HTML'
    <!DOCTYPE html>
    <html>
    <head><title>Fragment Href Test</title></head>
    <body>
    <article>
    <p>Jump to <a href="#section-2">section 2</a> of this page.</p>
    </article>
    </body>
    </html>
    HTML;

    $htmlHeaders = ['Content-Type' => 'text/html'];
    Http::fake([
        'example.com/*' => Http::response($page, 200, $htmlHeaders),
    ]);

    $action = $this->makeAction();
    $fetched = $action('https://example.com/article');

    $this->assertSame(CrawlOutcomeEnum::Success, $fetched->outcome);

    $linkCount = $fetched->outboundLinks->count();
    $this->assertSame(0, $linkCount);
}
private function makeAction(): FetchPageAction private function makeAction(): FetchPageAction
{ {
return app(FetchPageAction::class); return app(FetchPageAction::class);