12 - Apply pr-reviewer follow-ups: validation, link filters, readonly VO, docs
This commit is contained in:
parent
35e1147823
commit
dda5b0f770
7 changed files with 126 additions and 45 deletions
|
|
@ -22,6 +22,7 @@ class FetchPageAction
|
|||
{
|
||||
public function __construct(
|
||||
private Factory $http,
|
||||
private UrlService $urlService,
|
||||
) {}
|
||||
|
||||
public function __invoke(string $url): FetchResult
|
||||
|
|
@ -46,7 +47,7 @@ public function __invoke(string $url): FetchResult
|
|||
|
||||
if ($outcome === CrawlOutcomeEnum::Success) {
|
||||
[$title, $extractedText, $links] = $this->extractTitleTextAndLinks($response->body(), $url);
|
||||
$wordCount = count(preg_split('/\s+/u', trim($extractedText)));
|
||||
$wordCount = $extractedText !== '' ? count(preg_split('/\s+/u', trim($extractedText))) : 0;
|
||||
}
|
||||
|
||||
return new FetchResult(
|
||||
|
|
@ -64,18 +65,17 @@ public function __invoke(string $url): FetchResult
|
|||
private function validateResponse(Response $response): array
|
||||
{
|
||||
$status = $response->status();
|
||||
$statusStart = substr((string) $status, 0, 1);
|
||||
|
||||
if ($statusStart === '4') {
|
||||
if ($status >= 400 && $status < 500) {
|
||||
return [CrawlOutcomeEnum::Blocked4xx, "HTTP {$status}"];
|
||||
}
|
||||
|
||||
if (str_starts_with((string) $status, '5')) {
|
||||
if ($status >= 500) {
|
||||
return [CrawlOutcomeEnum::Blocked5xx, "HTTP {$status}"];
|
||||
}
|
||||
|
||||
$contentType = $response->header('Content-Type');
|
||||
if (! str_starts_with($contentType, 'text/html')) {
|
||||
if (! str_starts_with(mb_strtolower($contentType), 'text/html')) {
|
||||
return [CrawlOutcomeEnum::Rejected, "Unsupported file type: {$contentType}"];
|
||||
}
|
||||
|
||||
|
|
@ -147,8 +147,12 @@ private function resolveAndValidateLink(string $href, string $finalUrl): ?string
|
|||
return null;
|
||||
}
|
||||
|
||||
if ($resolved === $finalUrl) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
app(UrlService::class)->host($resolved);
|
||||
$this->urlService->host($resolved);
|
||||
} catch (InvalidArgumentException) {
|
||||
return null;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,5 +12,12 @@ enum CrawlOutcomeEnum: string
|
|||
case BlockedRobots = 'blocked_robots';
|
||||
case Blocked4xx = 'blocked_4xx';
|
||||
case Blocked5xx = 'blocked_5xx';
|
||||
|
||||
/**
|
||||
* The HTTP fetch succeeded (2xx) but the response is unindexable in v0.1
|
||||
* (non-HTML Content-Type). Worker MUST also write `pages.status = Rejected`
|
||||
* on this outcome — do NOT treat as Failed. Page row STAYS in the DB to
|
||||
* prevent re-discovery loops as fediverse re-shares the URL.
|
||||
*/
|
||||
case Rejected = 'rejected';
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9,5 +9,12 @@ enum PageStatusEnum: string
|
|||
case Discovered = 'discovered';
|
||||
case Fetched = 'fetched';
|
||||
case Failed = 'failed';
|
||||
|
||||
/**
|
||||
* The crawler fetched the page but rejected it as unindexable in v0.1
|
||||
* (non-HTML Content-Type). Page row stays as a sentinel preventing
|
||||
* re-discovery loops; future re-crawl could flip status back to
|
||||
* Discovered → Fetched if the URL starts serving HTML.
|
||||
*/
|
||||
case Rejected = 'rejected';
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,9 +7,10 @@
|
|||
use App\Enums\CrawlOutcomeEnum;
|
||||
use Illuminate\Support\Collection;
|
||||
|
||||
class FetchResult
|
||||
final readonly class FetchResult
|
||||
{
|
||||
/**
|
||||
* @param ?string $finalUrl Set to the request URL in v0.1; true post-redirect URL tracking is deferred (see ticket #12 spec). Downstream consumers MUST NOT trust this field as the post-redirect location until that lands.
|
||||
* @param Collection<int, string> $outboundLinks
|
||||
*/
|
||||
public function __construct(
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@
|
|||
],
|
||||
"require": {
|
||||
"php": "^8.3",
|
||||
"fivefilters/readability.php": "@dev",
|
||||
"fivefilters/readability.php": "^3.3",
|
||||
"laravel/framework": "^13.0",
|
||||
"laravel/tinker": "^3.0",
|
||||
"livewire/livewire": "^4.2",
|
||||
|
|
|
|||
73
composer.lock
generated
73
composer.lock
generated
|
|
@ -4,7 +4,7 @@
|
|||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"content-hash": "30d45d9b30092cc20f9364f7c3828aa5",
|
||||
"content-hash": "2c63ed546b17b144997244f805e8a94a",
|
||||
"packages": [
|
||||
{
|
||||
"name": "brick/math",
|
||||
|
|
@ -4620,7 +4620,7 @@
|
|||
},
|
||||
{
|
||||
"name": "symfony/polyfill-ctype",
|
||||
"version": "v1.36.0",
|
||||
"version": "v1.37.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/symfony/polyfill-ctype.git",
|
||||
|
|
@ -4679,7 +4679,7 @@
|
|||
"portable"
|
||||
],
|
||||
"support": {
|
||||
"source": "https://github.com/symfony/polyfill-ctype/tree/v1.36.0"
|
||||
"source": "https://github.com/symfony/polyfill-ctype/tree/v1.37.0"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
|
|
@ -4703,16 +4703,16 @@
|
|||
},
|
||||
{
|
||||
"name": "symfony/polyfill-intl-grapheme",
|
||||
"version": "v1.36.0",
|
||||
"version": "v1.37.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/symfony/polyfill-intl-grapheme.git",
|
||||
"reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df"
|
||||
"reference": "4864388bfbd3001ce88e234fab652acd91fdc57e"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/ad1b7b9092976d6c948b8a187cec9faaea9ec1df",
|
||||
"reference": "ad1b7b9092976d6c948b8a187cec9faaea9ec1df",
|
||||
"url": "https://api.github.com/repos/symfony/polyfill-intl-grapheme/zipball/4864388bfbd3001ce88e234fab652acd91fdc57e",
|
||||
"reference": "4864388bfbd3001ce88e234fab652acd91fdc57e",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
|
|
@ -4761,7 +4761,7 @@
|
|||
"shim"
|
||||
],
|
||||
"support": {
|
||||
"source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.36.0"
|
||||
"source": "https://github.com/symfony/polyfill-intl-grapheme/tree/v1.37.0"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
|
|
@ -4781,11 +4781,11 @@
|
|||
"type": "tidelift"
|
||||
}
|
||||
],
|
||||
"time": "2026-04-10T16:19:22+00:00"
|
||||
"time": "2026-04-26T13:13:48+00:00"
|
||||
},
|
||||
{
|
||||
"name": "symfony/polyfill-intl-idn",
|
||||
"version": "v1.36.0",
|
||||
"version": "v1.37.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/symfony/polyfill-intl-idn.git",
|
||||
|
|
@ -4848,7 +4848,7 @@
|
|||
"shim"
|
||||
],
|
||||
"support": {
|
||||
"source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.36.0"
|
||||
"source": "https://github.com/symfony/polyfill-intl-idn/tree/v1.37.0"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
|
|
@ -4872,7 +4872,7 @@
|
|||
},
|
||||
{
|
||||
"name": "symfony/polyfill-intl-normalizer",
|
||||
"version": "v1.36.0",
|
||||
"version": "v1.37.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/symfony/polyfill-intl-normalizer.git",
|
||||
|
|
@ -4933,7 +4933,7 @@
|
|||
"shim"
|
||||
],
|
||||
"support": {
|
||||
"source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.36.0"
|
||||
"source": "https://github.com/symfony/polyfill-intl-normalizer/tree/v1.37.0"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
|
|
@ -4957,7 +4957,7 @@
|
|||
},
|
||||
{
|
||||
"name": "symfony/polyfill-mbstring",
|
||||
"version": "v1.36.0",
|
||||
"version": "v1.37.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/symfony/polyfill-mbstring.git",
|
||||
|
|
@ -5018,7 +5018,7 @@
|
|||
"shim"
|
||||
],
|
||||
"support": {
|
||||
"source": "https://github.com/symfony/polyfill-mbstring/tree/v1.36.0"
|
||||
"source": "https://github.com/symfony/polyfill-mbstring/tree/v1.37.0"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
|
|
@ -5042,7 +5042,7 @@
|
|||
},
|
||||
{
|
||||
"name": "symfony/polyfill-php80",
|
||||
"version": "v1.36.0",
|
||||
"version": "v1.37.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/symfony/polyfill-php80.git",
|
||||
|
|
@ -5102,7 +5102,7 @@
|
|||
"shim"
|
||||
],
|
||||
"support": {
|
||||
"source": "https://github.com/symfony/polyfill-php80/tree/v1.36.0"
|
||||
"source": "https://github.com/symfony/polyfill-php80/tree/v1.37.0"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
|
|
@ -5126,7 +5126,7 @@
|
|||
},
|
||||
{
|
||||
"name": "symfony/polyfill-php83",
|
||||
"version": "v1.36.0",
|
||||
"version": "v1.37.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/symfony/polyfill-php83.git",
|
||||
|
|
@ -5182,7 +5182,7 @@
|
|||
"shim"
|
||||
],
|
||||
"support": {
|
||||
"source": "https://github.com/symfony/polyfill-php83/tree/v1.36.0"
|
||||
"source": "https://github.com/symfony/polyfill-php83/tree/v1.37.0"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
|
|
@ -5206,7 +5206,7 @@
|
|||
},
|
||||
{
|
||||
"name": "symfony/polyfill-php84",
|
||||
"version": "v1.36.0",
|
||||
"version": "v1.37.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/symfony/polyfill-php84.git",
|
||||
|
|
@ -5262,7 +5262,7 @@
|
|||
"shim"
|
||||
],
|
||||
"support": {
|
||||
"source": "https://github.com/symfony/polyfill-php84/tree/v1.36.0"
|
||||
"source": "https://github.com/symfony/polyfill-php84/tree/v1.37.0"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
|
|
@ -5286,16 +5286,16 @@
|
|||
},
|
||||
{
|
||||
"name": "symfony/polyfill-php85",
|
||||
"version": "v1.36.0",
|
||||
"version": "v1.37.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/symfony/polyfill-php85.git",
|
||||
"reference": "2c408a6bb0313e6001a83628dc5506100474254e"
|
||||
"reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/2c408a6bb0313e6001a83628dc5506100474254e",
|
||||
"reference": "2c408a6bb0313e6001a83628dc5506100474254e",
|
||||
"url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/fcfa4973a9917cef23f2e38774da74a2b7d115ee",
|
||||
"reference": "fcfa4973a9917cef23f2e38774da74a2b7d115ee",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
|
|
@ -5342,7 +5342,7 @@
|
|||
"shim"
|
||||
],
|
||||
"support": {
|
||||
"source": "https://github.com/symfony/polyfill-php85/tree/v1.36.0"
|
||||
"source": "https://github.com/symfony/polyfill-php85/tree/v1.37.0"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
|
|
@ -5362,11 +5362,11 @@
|
|||
"type": "tidelift"
|
||||
}
|
||||
],
|
||||
"time": "2026-04-10T16:50:15+00:00"
|
||||
"time": "2026-04-26T13:10:57+00:00"
|
||||
},
|
||||
{
|
||||
"name": "symfony/polyfill-uuid",
|
||||
"version": "v1.36.0",
|
||||
"version": "v1.37.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/symfony/polyfill-uuid.git",
|
||||
|
|
@ -5425,7 +5425,7 @@
|
|||
"uuid"
|
||||
],
|
||||
"support": {
|
||||
"source": "https://github.com/symfony/polyfill-uuid/tree/v1.36.0"
|
||||
"source": "https://github.com/symfony/polyfill-uuid/tree/v1.37.0"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
|
|
@ -6263,16 +6263,16 @@
|
|||
},
|
||||
{
|
||||
"name": "voku/portable-ascii",
|
||||
"version": "2.1.0",
|
||||
"version": "2.1.1",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/voku/portable-ascii.git",
|
||||
"reference": "d870a33f0f79d2b4579740b0620200221ee44aeb"
|
||||
"reference": "8e1051fe39379367aecf014f41744ce7539a856f"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/voku/portable-ascii/zipball/d870a33f0f79d2b4579740b0620200221ee44aeb",
|
||||
"reference": "d870a33f0f79d2b4579740b0620200221ee44aeb",
|
||||
"url": "https://api.github.com/repos/voku/portable-ascii/zipball/8e1051fe39379367aecf014f41744ce7539a856f",
|
||||
"reference": "8e1051fe39379367aecf014f41744ce7539a856f",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
|
|
@ -6309,7 +6309,7 @@
|
|||
],
|
||||
"support": {
|
||||
"issues": "https://github.com/voku/portable-ascii/issues",
|
||||
"source": "https://github.com/voku/portable-ascii/tree/2.1.0"
|
||||
"source": "https://github.com/voku/portable-ascii/tree/2.1.1"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
|
|
@ -6333,7 +6333,7 @@
|
|||
"type": "tidelift"
|
||||
}
|
||||
],
|
||||
"time": "2026-04-16T23:10:39+00:00"
|
||||
"time": "2026-04-26T05:33:54+00:00"
|
||||
}
|
||||
],
|
||||
"packages-dev": [
|
||||
|
|
@ -8649,7 +8649,6 @@
|
|||
"aliases": [],
|
||||
"minimum-stability": "stable",
|
||||
"stability-flags": {
|
||||
"fivefilters/readability.php": 20,
|
||||
"lvl0/fedi-discover": 20
|
||||
},
|
||||
"prefer-stable": true,
|
||||
|
|
@ -8658,5 +8657,5 @@
|
|||
"php": "^8.3"
|
||||
},
|
||||
"platform-dev": {},
|
||||
"plugin-api-version": "2.6.0"
|
||||
"plugin-api-version": "2.9.0"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -260,6 +260,69 @@ public function test_success_calculates_word_count(): void
|
|||
$this->assertSame(9, $result->wordCount);
|
||||
}
|
||||
|
||||
public function test_uppercase_content_type_is_accepted_as_html(): void
|
||||
{
|
||||
Http::fake([
|
||||
'example.com/*' => Http::response(
|
||||
'<html><head><title>Uppercase CT</title></head><body><p>Content here.</p></body></html>',
|
||||
200,
|
||||
['Content-Type' => 'Text/HTML; charset=utf-8'],
|
||||
),
|
||||
]);
|
||||
|
||||
$result = $this->makeAction()('https://example.com/page');
|
||||
|
||||
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
|
||||
}
|
||||
|
||||
public function test_empty_href_is_filtered_from_outbound_links(): void
|
||||
{
|
||||
$html = <<<'HTML'
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Empty Href Test</title></head>
|
||||
<body>
|
||||
<article>
|
||||
<p>This paragraph has <a href="">an empty href anchor</a> that should be dropped.</p>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
HTML;
|
||||
|
||||
Http::fake([
|
||||
'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']),
|
||||
]);
|
||||
|
||||
$result = $this->makeAction()('https://example.com/article');
|
||||
|
||||
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
|
||||
$this->assertSame(0, $result->outboundLinks->count());
|
||||
}
|
||||
|
||||
public function test_fragment_only_href_is_filtered_from_outbound_links(): void
|
||||
{
|
||||
$html = <<<'HTML'
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Fragment Href Test</title></head>
|
||||
<body>
|
||||
<article>
|
||||
<p>Jump to <a href="#section-2">section 2</a> of this page.</p>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
HTML;
|
||||
|
||||
Http::fake([
|
||||
'example.com/*' => Http::response($html, 200, ['Content-Type' => 'text/html']),
|
||||
]);
|
||||
|
||||
$result = $this->makeAction()('https://example.com/article');
|
||||
|
||||
$this->assertSame(CrawlOutcomeEnum::Success, $result->outcome);
|
||||
$this->assertSame(0, $result->outboundLinks->count());
|
||||
}
|
||||
|
||||
private function makeAction(): FetchPageAction
|
||||
{
|
||||
return app(FetchPageAction::class);
|
||||
|
|
|
|||
Loading…
Reference in a new issue