Optimize article fetching

This commit is contained in:
myrmidex 2025-06-29 21:20:45 +02:00
parent 137cb4ebfc
commit 97edb507f6
10 changed files with 339 additions and 77 deletions

View file

@ -3,7 +3,9 @@
namespace App\Console\Commands; namespace App\Console\Commands;
use App\Models\Article; use App\Models\Article;
use App\Services\Article\LemmyService; use App\Modules\Lemmy\Services\LemmyPublisher;
use App\Services\Article\ArticleFetcher;
use Exception;
use Illuminate\Console\Command; use Illuminate\Console\Command;
class PublishToLemmyCommand extends Command class PublishToLemmyCommand extends Command
@ -16,7 +18,13 @@ public function handle(): int
{ {
$article = Article::all()->firstOrFail(); $article = Article::all()->firstOrFail();
LemmyService::publish($article); $this->info('Publishing article: ' . $article->url);
try {
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticle($article));
} catch (Exception) {
return self::FAILURE;
}
return self::SUCCESS; return self::SUCCESS;
} }

View file

@ -3,13 +3,13 @@
namespace App\Listeners; namespace App\Listeners;
use App\Events\ArticleReadyToPublish; use App\Events\ArticleReadyToPublish;
use App\Services\Article\LemmyService; use App\Modules\Lemmy\Services\LemmyPublisher;
use App\Services\Article\ArticleFetcher;
class PublishArticle class PublishArticle
{ {
public function __construct() public function __construct()
{ {
//
} }
public function handle(ArticleReadyToPublish $event): void public function handle(ArticleReadyToPublish $event): void
@ -18,6 +18,6 @@ public function handle(ArticleReadyToPublish $event): void
logger('Publishing article: ' . $article->id . ' : ' . $article->url); logger('Publishing article: ' . $article->id . ' : ' . $article->url);
LemmyService::publish($article); LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticle($article));
} }
} }

View file

@ -24,13 +24,17 @@ class Article extends Model
protected $fillable = [ protected $fillable = [
'url', 'url',
'title',
'description',
'is_valid', 'is_valid',
'fetched_at',
'validated_at', 'validated_at',
]; ];
public function casts(): array public function casts(): array
{ {
return [ return [
'fetched_at' => 'datetime',
'validated_at' => 'datetime', 'validated_at' => 'datetime',
'created_at' => 'datetime', 'created_at' => 'datetime',
'updated_at' => 'datetime', 'updated_at' => 'datetime',

View file

@ -0,0 +1,50 @@
<?php
namespace App\Modules\Lemmy;
use Illuminate\Support\Facades\Http;
use Illuminate\Http\Client\Response;
/**
 * Minimal HTTP client for the v3 REST API of a single Lemmy instance.
 *
 * Every request targets https://{instance}/api/v3/{endpoint} with a 30 second
 * timeout. When a token has been supplied (constructor or withToken()), it is
 * sent as a bearer token.
 */
class LemmyRequest
{
    /** Path prefix shared by every endpoint this client calls. */
    private const API_PREFIX = 'api/v3';

    /** Per-request timeout, in seconds. */
    private const TIMEOUT_SECONDS = 30;

    /**
     * @param string      $instance Host name of the Lemmy instance (no scheme).
     * @param string|null $token    Optional JWT sent as a bearer token.
     */
    public function __construct(
        private string $instance,
        private ?string $token = null,
    ) {
    }

    /**
     * Send a GET request to an API endpoint.
     *
     * @param string $endpoint Endpoint path relative to /api/v3/.
     * @param array  $params   Query string parameters.
     */
    public function get(string $endpoint, array $params = []): Response
    {
        return $this->client()->get($this->url($endpoint), $params);
    }

    /**
     * Send a POST request to an API endpoint.
     *
     * @param string $endpoint Endpoint path relative to /api/v3/.
     * @param array  $data     Request payload.
     */
    public function post(string $endpoint, array $data = []): Response
    {
        return $this->client()->post($this->url($endpoint), $data);
    }

    /**
     * Set the token used for subsequent requests.
     *
     * Mutates this instance and returns it so calls can be chained.
     */
    public function withToken(string $token): self
    {
        $this->token = $token;

        return $this;
    }

    /**
     * Build the absolute URL for an endpoint.
     *
     * A leading slash on the endpoint is dropped so that "/post" and "post"
     * resolve to the same URL instead of producing "/api/v3//post".
     */
    private function url(string $endpoint): string
    {
        return sprintf(
            'https://%s/%s/%s',
            $this->instance,
            self::API_PREFIX,
            ltrim($endpoint, '/'),
        );
    }

    /**
     * Create the pending request shared by get() and post(): timeout applied,
     * bearer token attached when one is set.
     */
    private function client(): \Illuminate\Http\Client\PendingRequest
    {
        $request = Http::timeout(self::TIMEOUT_SECONDS);

        if ($this->token !== null && $this->token !== '') {
            $request = $request->withToken($this->token);
        }

        return $request;
    }
}

View file

@ -0,0 +1,80 @@
<?php
namespace App\Modules\Lemmy\Services;
use App\Modules\Lemmy\LemmyRequest;
use Exception;
/**
 * Wraps the Lemmy API calls needed for publishing: login, community lookup
 * and post creation. Each call builds a LemmyRequest for the configured
 * instance.
 */
class LemmyApiService
{
    /**
     * @param string $instance Host name of the Lemmy instance.
     */
    public function __construct(private string $instance)
    {
    }

    /**
     * Log in and return the JWT.
     *
     * Failures are logged and reported as null rather than thrown.
     *
     * @return string|null The JWT, or null when the request fails, throws, or
     *                     the response carries no usable "jwt" value.
     */
    public function login(string $username, string $password): ?string
    {
        try {
            $request = new LemmyRequest($this->instance);
            $response = $request->post('user/login', [
                'username_or_email' => $username,
                'password' => $password,
            ]);

            if (!$response->successful()) {
                logger()->error('Lemmy login failed', [
                    'status' => $response->status(),
                    'body' => $response->body()
                ]);

                return null;
            }

            $data = $response->json();
            $jwt = is_array($data) ? ($data['jwt'] ?? null) : null;

            // Anything other than a string would violate the ?string return
            // type with a TypeError, which the catch below does not handle.
            return is_string($jwt) ? $jwt : null;
        } catch (Exception $e) {
            logger()->error('Lemmy login exception', ['error' => $e->getMessage()]);

            return null;
        }
    }

    /**
     * Resolve a community name to its numeric id.
     *
     * @throws Exception When the request fails or the response has no
     *                   community id. The error is logged before rethrowing.
     */
    public function getCommunityId(string $communityName): int
    {
        try {
            $request = new LemmyRequest($this->instance);
            $response = $request->get('community', ['name' => $communityName]);

            if (!$response->successful()) {
                throw new Exception('Failed to fetch community: ' . $response->status());
            }

            $data = $response->json();

            return $data['community_view']['community']['id'] ?? throw new Exception('Community not found');
        } catch (Exception $e) {
            logger()->error('Community lookup failed', ['error' => $e->getMessage()]);
            throw $e;
        }
    }

    /**
     * Create a post in a community.
     *
     * @param string $token       JWT used to authenticate the request.
     * @param string $title       Sent as the post "name".
     * @param string $body        Sent as the post "body".
     * @param int    $communityId Target community id.
     *
     * @return array Decoded JSON response.
     *
     * @throws Exception When the request fails or the response body does not
     *                   decode to an array. The error is logged before
     *                   rethrowing.
     */
    public function createPost(string $token, string $title, string $body, int $communityId): array
    {
        try {
            $request = new LemmyRequest($this->instance, $token);
            $response = $request->post('post', [
                'name' => $title,
                'body' => $body,
                'community_id' => $communityId,
            ]);

            if (!$response->successful()) {
                throw new Exception('Failed to create post: ' . $response->status() . ' - ' . $response->body());
            }

            $payload = $response->json();

            // A successful status with an empty or non-JSON body decodes to
            // null. Returning that would raise a TypeError, which is not an
            // Exception and would bypass this catch and callers' handlers.
            if (!is_array($payload)) {
                throw new Exception('Failed to create post: unexpected response body');
            }

            return $payload;
        } catch (Exception $e) {
            logger()->error('Post creation failed', ['error' => $e->getMessage()]);
            throw $e;
        }
    }
}

View file

@ -0,0 +1,86 @@
<?php
namespace App\Modules\Lemmy\Services;
use App\Models\Article;
use App\Models\ArticlePublication;
use Exception;
use Illuminate\Support\Facades\Cache;
/**
 * Publishes an article to a Lemmy community and stores an
 * ArticlePublication record for the created post.
 */
class LemmyPublisher
{
    private LemmyApiService $api;

    /**
     * @param string $instance  Lemmy instance handed to the API service.
     * @param string $username  Stored as "published_by" on the record.
     * @param string $community Name of the community to post into.
     */
    public function __construct(
        string $instance,
        private string $username,
        private string $community,
    ) {
        $this->api = new LemmyApiService($instance);
    }

    /**
     * Build a publisher from the lemmy.* configuration values.
     */
    public static function fromConfig(): self
    {
        $instance = config('lemmy.instance');
        $username = config('lemmy.username');
        $community = config('lemmy.community');

        return new self($instance, $username, $community);
    }

    /**
     * Create a Lemmy post from the extracted article data and record it.
     *
     * Missing data falls back to the title "Untitled" and an empty body.
     *
     * @param array $extractedData Reads the "title" and "description" keys.
     *
     * @throws Exception When no auth token could be obtained, or when the
     *                   API service throws.
     */
    public function publish(Article $article, array $extractedData): ArticlePublication
    {
        $token = $this->getAuthToken() ?: throw new Exception('Failed to authenticate with Lemmy');

        $communityId = $this->getCommunityId();

        $title = $extractedData['title'] ?? 'Untitled';
        $body = $extractedData['description'] ?? '';

        $postData = $this->api->createPost($token, $title, $body, $communityId);

        return $this->storePublication($article, $postData, $communityId);
    }

    /**
     * Return the JWT, cached under "lemmy_jwt_token" with a TTL of 3600.
     *
     * Credentials are read from lemmy.username / lemmy.password; when either
     * is missing an error is logged and null is produced.
     */
    private function getAuthToken(): ?string
    {
        $login = function () {
            $username = config('lemmy.username');
            $password = config('lemmy.password');

            if ($username && $password) {
                return $this->api->login($username, $password);
            }

            logger()->error('Missing Lemmy credentials');

            return null;
        };

        return Cache::remember('lemmy_jwt_token', 3600, $login);
    }

    /**
     * Return the id of the configured community, cached per community name
     * with a TTL of 3600.
     */
    private function getCommunityId(): int
    {
        $cacheKey = 'lemmy_community_id_' . $this->community;

        return Cache::remember(
            $cacheKey,
            3600,
            fn () => $this->api->getCommunityId($this->community),
        );
    }

    /**
     * Persist the publication record for a freshly created post.
     *
     * @param array $postData Create-post response; the post id is read from
     *                        post_view.post.id and the whole array is stored
     *                        as publication_data.
     */
    private function storePublication(Article $article, array $postData, int $communityId): ArticlePublication
    {
        $attributes = [
            'article_id' => $article->id,
            'post_id' => $postData['post_view']['post']['id'],
            'community_id' => $communityId,
            'published_by' => $this->username,
            'published_at' => now(),
            'platform' => 'lemmy',
            'publication_data' => $postData,
        ];

        return ArticlePublication::create($attributes);
    }
}

View file

@ -0,0 +1,75 @@
<?php
namespace App\Services\Article;
/**
 * Regex-based extraction of title, description and body text from article HTML.
 *
 * All returned strings have tags stripped where the source is an element,
 * HTML entities decoded, and surrounding whitespace trimmed. An empty result
 * is treated as "not found".
 */
class ArticleDataExtractor
{
    /**
     * Extract the article title.
     *
     * Sources, in order of preference: the og:title meta tag, the first <h1>
     * (nested inline markup is stripped), then the <title> element.
     *
     * @return string|null The title, or null when none of the sources yields text.
     */
    public static function extractTitle(string $html): ?string
    {
        $metaTitle = self::extractMetaProperty($html, 'og:title');
        if ($metaTitle !== null) {
            return $metaTitle;
        }

        // Capture lazily up to the closing tag so headings that contain
        // nested markup (<h1><span>…</span></h1>) are matched too.
        if (preg_match('/<h1\b[^>]*>(.*?)<\/h1>/is', $html, $matches)) {
            $heading = self::decode(strip_tags($matches[1]));
            if ($heading !== '') {
                return $heading;
            }
        }

        if (preg_match('/<title\b[^>]*>([^<]+)<\/title>/i', $html, $matches)) {
            $title = self::decode($matches[1]);
            if ($title !== '') {
                return $title;
            }
        }

        return null;
    }

    /**
     * Extract a short description.
     *
     * Sources, in order of preference: the og:description meta tag, then the
     * first non-empty <p> element.
     *
     * @return string|null The description, or null when nothing usable is found.
     */
    public static function extractDescription(string $html): ?string
    {
        $metaDescription = self::extractMetaProperty($html, 'og:description');
        if ($metaDescription !== null) {
            return $metaDescription;
        }

        // \b keeps <pre>, <picture>, <path>… from being mistaken for <p>.
        $pattern = '/<p\b[^>]*>([^<]+(?:<[^\/](?!p)[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/i';
        if (preg_match_all($pattern, $html, $matches)) {
            foreach ($matches[1] as $fragment) {
                $paragraph = self::decode(strip_tags($fragment));
                if ($paragraph !== '') {
                    return $paragraph;
                }
            }
        }

        return null;
    }

    /**
     * Run every extractor on the same document.
     *
     * @return array{title: ?string, description: ?string, full_article: ?string}
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
        ];
    }

    /**
     * Extract the text of all <p> elements, joined by blank lines.
     *
     * <script> and <style> elements are removed first so markup inside them
     * is not picked up as paragraphs.
     *
     * @return string|null The joined text, or null when there is no non-empty paragraph.
     */
    public static function extractFullArticle(string $html): ?string
    {
        $nonContentPatterns = [
            '/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi',
            '/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi',
        ];

        $cleanHtml = $html;
        foreach ($nonContentPatterns as $pattern) {
            // preg_replace() returns null on a PCRE failure (e.g. backtrack
            // limit); keep the previous markup instead of passing null on.
            $cleanHtml = preg_replace($pattern, '', $cleanHtml) ?? $cleanHtml;
        }

        // \b keeps <pre>, <picture>, <path>… from being mistaken for <p>.
        if (!preg_match_all('/<p\b[^>]*>(.*?)<\/p>/is', $cleanHtml, $matches)) {
            return null;
        }

        $paragraphs = [];
        foreach ($matches[1] as $fragment) {
            $text = self::decode(strip_tags($fragment));
            if ($text !== '') {
                $paragraphs[] = $text;
            }
        }

        return $paragraphs === [] ? null : implode("\n\n", $paragraphs);
    }

    /**
     * Read the content attribute of the first <meta> tag whose property
     * attribute equals $property, regardless of attribute order or quote style.
     *
     * @return string|null The decoded content, or null when there is no such
     *                     tag or its content is empty.
     */
    private static function extractMetaProperty(string $html, string $property): ?string
    {
        // Quoted attribute values are consumed as units so a ">" inside a
        // value does not end the tag early.
        $tagPattern = '/<meta\b(?:[^>"\']++|"[^"]*+"|\'[^\']*+\')*+>/i';
        if (!preg_match_all($tagPattern, $html, $tags)) {
            return null;
        }

        $propertyPattern = '/\sproperty\s*=\s*(["\'])' . preg_quote($property, '/') . '\1/i';
        $contentPattern = '/\scontent\s*=\s*(["\'])(.*?)\1/is';

        foreach ($tags[0] as $tag) {
            if (!preg_match($propertyPattern, $tag)) {
                continue;
            }

            if (preg_match($contentPattern, $tag, $content)) {
                $value = self::decode($content[2]);
                if ($value !== '') {
                    return $value;
                }
            }
        }

        return null;
    }

    /**
     * Decode HTML entities (HTML5 table, so &apos; is covered) and trim.
     */
    private static function decode(string $text): string
    {
        return trim(html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
    }
}

View file

@ -15,6 +15,33 @@ public static function getNewArticles(): Collection
->map(fn (string $url) => self::saveArticle($url)); ->map(fn (string $url) => self::saveArticle($url));
} }
/**
 * Download the article page and extract its data.
 *
 * Failures are logged and reported as an empty array rather than thrown.
 *
 * @return array The result of ArticleDataExtractor::extractData() (keys
 *               "title", "description", "full_article"), or an empty array
 *               when the response is not successful or the request throws.
 */
public static function fetchArticle(Article $article): array
{
    try {
        $response = Http::get($article->url);

        if (!$response->successful()) {
            logger()->error('Failed to fetch article', [
                'url' => $article->url,
                'status' => $response->status()
            ]);

            return [];
        }

        return ArticleDataExtractor::extractData($response->body());
    } catch (Exception $e) {
        logger()->error('Exception while fetching article', [
            'url' => $article->url,
            'error' => $e->getMessage()
        ]);

        // Same "nothing extracted" result as the unsuccessful-response branch.
        // Returning the Article model here would violate the array return
        // type and raise a TypeError while handling the original failure.
        return [];
    }
}
private static function fetchArticles(): Collection private static function fetchArticles(): Collection
{ {
try { try {
@ -60,7 +87,7 @@ private static function fetchArticles(): Collection
} }
} }
protected static function saveArticle(string $url): Article private static function saveArticle(string $url): Article
{ {
return Article::firstOrCreate(['url' => $url]); return Article::firstOrCreate(['url' => $url]);
} }

View file

@ -1,70 +0,0 @@
<?php
namespace App\Services\Article;
use App\Models\Article;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Facades\Cache;
use Exception;
/**
 * Early Lemmy integration: obtains a JWT for the configured account.
 *
 * publish() is still a debugging stub: it dumps the token and stops.
 */
class LemmyService
{
    /**
     * Fetch the JWT and dump it.
     *
     * dd() is Laravel's dump-and-die helper, so execution ends there and the
     * return statement is never reached. A community lookup against
     * /api/v3/community previously existed here only as commented-out code.
     */
    public static function publish(Article $article): bool
    {
        $jwt = self::getJwtToken();

        dd(['jwt' => $jwt]);

        return true;
    }

    /**
     * Return the JWT, cached under "lemmy_jwt_token" with a TTL of 3600.
     */
    private static function getJwtToken(): ?string
    {
        return Cache::remember('lemmy_jwt_token', 3600, static fn () => self::login());
    }

    /**
     * Log in with the lemmy.* configuration values.
     *
     * @return string|null The "jwt" field of the login response, or null when
     *                     configuration is missing, the request fails, or an
     *                     exception is thrown (each case is logged).
     */
    private static function login(): ?string
    {
        $username = config('lemmy.username');
        $password = config('lemmy.password');
        $instance = config('lemmy.instance');

        $configured = $username && $password && $instance;
        if (!$configured) {
            logger()->error('Missing Lemmy configuration');

            return null;
        }

        try {
            $credentials = [
                'username_or_email' => $username,
                'password' => $password,
            ];
            $response = Http::post("https://$instance/api/v3/user/login", $credentials);

            if ($response->successful()) {
                $data = $response->json();

                return $data['jwt'] ?? null;
            }

            logger()->error('Lemmy login failed', [
                'status' => $response->status(),
                'body' => $response->body()
            ]);

            return null;
        } catch (Exception $e) {
            logger()->error('Lemmy login exception', ['error' => $e->getMessage()]);

            return null;
        }
    }
}

View file

@ -11,6 +11,8 @@ public function up(): void
Schema::create('articles', function (Blueprint $table) { Schema::create('articles', function (Blueprint $table) {
$table->id(); $table->id();
$table->string('url'); $table->string('url');
$table->string('title')->nullable();
$table->text('description')->nullable();
$table->boolean('is_valid')->nullable(); $table->boolean('is_valid')->nullable();
$table->timestamp('validated_at')->nullable(); $table->timestamp('validated_at')->nullable();
$table->timestamps(); $table->timestamps();