Optimize article fetching
This commit is contained in:
parent
137cb4ebfc
commit
97edb507f6
10 changed files with 339 additions and 77 deletions
|
|
@ -3,7 +3,9 @@
|
||||||
namespace App\Console\Commands;
|
namespace App\Console\Commands;
|
||||||
|
|
||||||
use App\Models\Article;
|
use App\Models\Article;
|
||||||
use App\Services\Article\LemmyService;
|
use App\Modules\Lemmy\Services\LemmyPublisher;
|
||||||
|
use App\Services\Article\ArticleFetcher;
|
||||||
|
use Exception;
|
||||||
use Illuminate\Console\Command;
|
use Illuminate\Console\Command;
|
||||||
|
|
||||||
class PublishToLemmyCommand extends Command
|
class PublishToLemmyCommand extends Command
|
||||||
|
|
@ -16,7 +18,13 @@ public function handle(): int
|
||||||
{
|
{
|
||||||
$article = Article::all()->firstOrFail();
|
$article = Article::all()->firstOrFail();
|
||||||
|
|
||||||
LemmyService::publish($article);
|
$this->info('Publishing article: ' . $article->url);
|
||||||
|
|
||||||
|
try {
|
||||||
|
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticle($article));
|
||||||
|
} catch (Exception) {
|
||||||
|
return self::FAILURE;
|
||||||
|
}
|
||||||
|
|
||||||
return self::SUCCESS;
|
return self::SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -3,13 +3,13 @@
|
||||||
namespace App\Listeners;
|
namespace App\Listeners;
|
||||||
|
|
||||||
use App\Events\ArticleReadyToPublish;
|
use App\Events\ArticleReadyToPublish;
|
||||||
use App\Services\Article\LemmyService;
|
use App\Modules\Lemmy\Services\LemmyPublisher;
|
||||||
|
use App\Services\Article\ArticleFetcher;
|
||||||
|
|
||||||
class PublishArticle
|
class PublishArticle
|
||||||
{
|
{
|
||||||
public function __construct()
|
public function __construct()
|
||||||
{
|
{
|
||||||
//
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public function handle(ArticleReadyToPublish $event): void
|
public function handle(ArticleReadyToPublish $event): void
|
||||||
|
|
@ -18,6 +18,6 @@ public function handle(ArticleReadyToPublish $event): void
|
||||||
|
|
||||||
logger('Publishing article: ' . $article->id . ' : ' . $article->url);
|
logger('Publishing article: ' . $article->id . ' : ' . $article->url);
|
||||||
|
|
||||||
LemmyService::publish($article);
|
LemmyPublisher::fromConfig()->publish($article, ArticleFetcher::fetchArticle($article));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -24,13 +24,17 @@ class Article extends Model
|
||||||
|
|
||||||
protected $fillable = [
|
protected $fillable = [
|
||||||
'url',
|
'url',
|
||||||
|
'title',
|
||||||
|
'description',
|
||||||
'is_valid',
|
'is_valid',
|
||||||
|
'fetched_at',
|
||||||
'validated_at',
|
'validated_at',
|
||||||
];
|
];
|
||||||
|
|
||||||
public function casts(): array
|
public function casts(): array
|
||||||
{
|
{
|
||||||
return [
|
return [
|
||||||
|
'fetched_at' => 'datetime',
|
||||||
'validated_at' => 'datetime',
|
'validated_at' => 'datetime',
|
||||||
'created_at' => 'datetime',
|
'created_at' => 'datetime',
|
||||||
'updated_at' => 'datetime',
|
'updated_at' => 'datetime',
|
||||||
|
|
|
||||||
50
app/Modules/Lemmy/LemmyRequest.php
Normal file
50
app/Modules/Lemmy/LemmyRequest.php
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Modules\Lemmy;
|
||||||
|
|
||||||
|
use Illuminate\Support\Facades\Http;
|
||||||
|
use Illuminate\Http\Client\Response;
|
||||||
|
|
||||||
|
/**
 * Minimal HTTP client for the v3 REST API of one Lemmy instance.
 *
 * Every request goes to `https://{instance}/api/v3/{endpoint}` with a fixed
 * timeout. When a token has been supplied (constructor or withToken()) it is
 * attached via the HTTP client's withToken() helper.
 */
class LemmyRequest
{
    /** Timeout, in seconds, applied to every outgoing request. */
    private const TIMEOUT_SECONDS = 30;

    /**
     * @param string      $instance Host name of the Lemmy instance (e.g. "lemmy.example").
     * @param string|null $token    Optional auth token sent with each request.
     */
    public function __construct(
        private string $instance,
        private ?string $token = null,
    ) {
    }

    /**
     * Send a GET request to the given API endpoint.
     *
     * @param string               $endpoint Path below `/api/v3/`, e.g. "community".
     * @param array<string, mixed> $params   Query string parameters.
     */
    public function get(string $endpoint, array $params = []): Response
    {
        return $this->client()->get($this->url($endpoint), $params);
    }

    /**
     * Send a POST request to the given API endpoint.
     *
     * @param string               $endpoint Path below `/api/v3/`, e.g. "user/login".
     * @param array<string, mixed> $data     Request payload.
     */
    public function post(string $endpoint, array $data = []): Response
    {
        return $this->client()->post($this->url($endpoint), $data);
    }

    /**
     * Set the auth token used by subsequent requests.
     *
     * Mutates and returns this instance so calls can be chained.
     */
    public function withToken(string $token): self
    {
        $this->token = $token;

        return $this;
    }

    /**
     * Build the absolute URL for an endpoint.
     *
     * A scheme or trailing slash on the configured instance and a leading
     * slash on the endpoint are tolerated so they cannot produce URLs such as
     * `https://https://host//api/v3//post`.
     */
    private function url(string $endpoint): string
    {
        $host = preg_replace('#^https?://#i', '', trim($this->instance)) ?? $this->instance;
        $host = rtrim($host, '/');

        return "https://{$host}/api/v3/" . ltrim($endpoint, '/');
    }

    /**
     * Create the pending request shared by get() and post().
     */
    private function client(): \Illuminate\Http\Client\PendingRequest
    {
        $request = Http::timeout(self::TIMEOUT_SECONDS);

        // Explicit comparison: only a missing/empty token means "anonymous".
        if ($this->token !== null && $this->token !== '') {
            $request = $request->withToken($this->token);
        }

        return $request;
    }
}
|
||||||
80
app/Modules/Lemmy/Services/LemmyApiService.php
Normal file
80
app/Modules/Lemmy/Services/LemmyApiService.php
Normal file
|
|
@ -0,0 +1,80 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Modules\Lemmy\Services;
|
||||||
|
|
||||||
|
use App\Modules\Lemmy\LemmyRequest;
|
||||||
|
use Exception;
|
||||||
|
|
||||||
|
/**
 * Wraps the Lemmy API calls used for publishing: login, community lookup
 * and post creation. All requests are sent through LemmyRequest.
 */
class LemmyApiService
{
    private string $instance;

    /**
     * @param string $instance Lemmy instance passed on to LemmyRequest.
     */
    public function __construct(string $instance)
    {
        $this->instance = $instance;
    }

    /**
     * Log in and return the JWT from the response.
     *
     * Failures are logged and reported as null rather than thrown.
     *
     * @return string|null The JWT, or null when the request fails, throws,
     *                     or the response carries no usable `jwt` value.
     */
    public function login(string $username, string $password): ?string
    {
        try {
            $request = new LemmyRequest($this->instance);
            $response = $request->post('user/login', [
                'username_or_email' => $username,
                'password' => $password,
            ]);

            if (!$response->successful()) {
                logger()->error('Lemmy login failed', [
                    'status' => $response->status(),
                    'body' => $response->body()
                ]);
                return null;
            }

            $data = $response->json();
            $jwt = is_array($data) ? ($data['jwt'] ?? null) : null;

            // Anything other than a non-empty string is not a usable token.
            return is_string($jwt) && $jwt !== '' ? $jwt : null;
        } catch (Exception $e) {
            logger()->error('Lemmy login exception', ['error' => $e->getMessage()]);
            return null;
        }
    }

    /**
     * Look up the numeric id of a community by name.
     *
     * @throws Exception When the request fails or the response contains no
     *                   numeric community id. The error is logged, then rethrown.
     */
    public function getCommunityId(string $communityName): int
    {
        try {
            $request = new LemmyRequest($this->instance);
            $response = $request->get('community', ['name' => $communityName]);

            if (!$response->successful()) {
                throw new Exception('Failed to fetch community: ' . $response->status());
            }

            $data = $response->json();
            $id = is_array($data) ? ($data['community_view']['community']['id'] ?? null) : null;

            // Validate before casting so a malformed payload surfaces as the
            // documented Exception instead of a TypeError on return.
            if (!is_numeric($id)) {
                throw new Exception('Community not found');
            }

            return (int) $id;
        } catch (Exception $e) {
            logger()->error('Community lookup failed', ['error' => $e->getMessage()]);
            throw $e;
        }
    }

    /**
     * Create a post in the given community.
     *
     * @param string $token       Auth token used for the request.
     * @param string $title       Sent as the post `name`.
     * @param string $body        Sent as the post `body`.
     * @param int    $communityId Target community id.
     *
     * @return array<string, mixed> Decoded JSON response.
     *
     * @throws Exception When the request fails or the response body is not a
     *                   JSON object/array. The error is logged, then rethrown.
     */
    public function createPost(string $token, string $title, string $body, int $communityId): array
    {
        try {
            $request = new LemmyRequest($this->instance, $token);
            $response = $request->post('post', [
                'name' => $title,
                'body' => $body,
                'community_id' => $communityId,
            ]);

            if (!$response->successful()) {
                throw new Exception('Failed to create post: ' . $response->status() . ' - ' . $response->body());
            }

            $payload = $response->json();

            // json() yields null/scalars for non-JSON bodies, which would
            // violate the array return type with an uncaught TypeError.
            if (!is_array($payload)) {
                throw new Exception('Failed to create post: unexpected response body');
            }

            return $payload;
        } catch (Exception $e) {
            logger()->error('Post creation failed', ['error' => $e->getMessage()]);
            throw $e;
        }
    }
}
|
||||||
86
app/Modules/Lemmy/Services/LemmyPublisher.php
Normal file
86
app/Modules/Lemmy/Services/LemmyPublisher.php
Normal file
|
|
@ -0,0 +1,86 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Modules\Lemmy\Services;
|
||||||
|
|
||||||
|
use App\Models\Article;
|
||||||
|
use App\Models\ArticlePublication;
|
||||||
|
use Exception;
|
||||||
|
use Illuminate\Support\Facades\Cache;
|
||||||
|
|
||||||
|
/**
 * Publishes an article to a Lemmy community and records the publication.
 *
 * The auth token and the community id are each cached for one hour.
 */
class LemmyPublisher
{
    /** Lifetime, in seconds, of the cached token and community id. */
    private const CACHE_TTL_SECONDS = 3600;

    private LemmyApiService $api;
    private string $username;
    private string $community;

    /**
     * @param string $instance  Lemmy instance handed to LemmyApiService.
     * @param string $username  Account used to log in; stored as `published_by`.
     * @param string $community Name of the community to post into.
     */
    public function __construct(string $instance, string $username, string $community)
    {
        $this->api = new LemmyApiService($instance);
        $this->username = $username;
        $this->community = $community;
    }

    /**
     * Build a publisher from the `lemmy.*` configuration values.
     *
     * @throws Exception When instance, username or community is not configured.
     *                   Raised explicitly so callers catching Exception see a
     *                   clear message instead of a TypeError from the constructor.
     */
    public static function fromConfig(): self
    {
        $settings = [
            'lemmy.instance' => config('lemmy.instance'),
            'lemmy.username' => config('lemmy.username'),
            'lemmy.community' => config('lemmy.community'),
        ];

        $missing = array_keys(array_filter(
            $settings,
            static fn ($value): bool => !is_string($value) || $value === '',
        ));

        if ($missing !== []) {
            throw new Exception('Missing Lemmy configuration: ' . implode(', ', $missing));
        }

        return new self(
            $settings['lemmy.instance'],
            $settings['lemmy.username'],
            $settings['lemmy.community'],
        );
    }

    /**
     * Create a Lemmy post for the article and persist a publication record.
     *
     * @param Article              $article       Article being published.
     * @param array<string, mixed> $extractedData Uses `title` (default "Untitled")
     *                                            and `description` (default "").
     *
     * @throws Exception When authentication, community lookup or post creation fails.
     */
    public function publish(Article $article, array $extractedData): ArticlePublication
    {
        $token = $this->getAuthToken();

        if (!$token) {
            throw new Exception('Failed to authenticate with Lemmy');
        }

        $communityId = $this->getCommunityId();

        $postData = $this->api->createPost(
            $token,
            $extractedData['title'] ?? 'Untitled',
            $extractedData['description'] ?? '',
            $communityId
        );

        return $this->createPublicationRecord($article, $postData, $communityId);
    }

    /**
     * Return the cached auth token, logging in when none is cached.
     *
     * NOTE(review): the cache key is not scoped by instance or username, so
     * publishers built with different credentials share one entry — confirm
     * this is acceptable before using more than one account.
     */
    private function getAuthToken(): ?string
    {
        return Cache::remember('lemmy_jwt_token', self::CACHE_TTL_SECONDS, function () {
            $password = config('lemmy.password');

            // Log in as the injected username so the token belongs to the same
            // account that is recorded as `published_by`.
            if ($this->username === '' || !$password) {
                logger()->error('Missing Lemmy credentials');
                return null;
            }

            return $this->api->login($this->username, $password);
        });
    }

    /**
     * Return the cached id of the configured community, fetching it on a miss.
     *
     * @throws Exception Propagated from LemmyApiService::getCommunityId().
     */
    private function getCommunityId(): int
    {
        return Cache::remember(
            "lemmy_community_id_{$this->community}",
            self::CACHE_TTL_SECONDS,
            fn () => $this->api->getCommunityId($this->community),
        );
    }

    /**
     * Persist the publication record for a created post.
     *
     * @param array<string, mixed> $postData Decoded create-post response.
     *
     * @throws Exception When the response does not contain `post_view.post.id`.
     */
    private function createPublicationRecord(Article $article, array $postData, int $communityId): ArticlePublication
    {
        $postId = $postData['post_view']['post']['id']
            ?? throw new Exception('Lemmy response did not contain a post id');

        return ArticlePublication::create([
            'article_id' => $article->id,
            'post_id' => $postId,
            'community_id' => $communityId,
            'published_by' => $this->username,
            'published_at' => now(),
            'platform' => 'lemmy',
            'publication_data' => $postData,
        ]);
    }
}
|
||||||
75
app/Services/Article/ArticleDataExtractor.php
Normal file
75
app/Services/Article/ArticleDataExtractor.php
Normal file
|
|
@ -0,0 +1,75 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Services\Article;
|
||||||
|
|
||||||
|
/**
 * Regex-based extraction of title, description and body text from raw HTML.
 *
 * All returned strings have tags stripped (element text only), HTML entities
 * decoded, runs of whitespace collapsed to a single space and surrounding
 * whitespace removed. Values that end up empty are reported as null.
 */
class ArticleDataExtractor
{
    /**
     * Extract a title, trying in order: the `og:title` meta tag, the first
     * non-empty `<h1>`, then the first non-empty `<title>`.
     *
     * @return string|null The title, or null when none of the sources yields text.
     */
    public static function extractTitle(string $html): ?string
    {
        return self::metaContent($html, 'og:title')
            ?? self::firstElementText($html, 'h1')
            ?? self::firstElementText($html, 'title');
    }

    /**
     * Extract a description: the `og:description` meta tag if present,
     * otherwise the first non-empty paragraph.
     *
     * @return string|null The description, or null when nothing is found.
     */
    public static function extractDescription(string $html): ?string
    {
        return self::metaContent($html, 'og:description')
            ?? (self::paragraphs($html)[0] ?? null);
    }

    /**
     * Run every extractor.
     *
     * @return array{title: ?string, description: ?string, full_article: ?string}
     */
    public static function extractData(string $html): array
    {
        return [
            'title' => self::extractTitle($html),
            'description' => self::extractDescription($html),
            'full_article' => self::extractFullArticle($html),
        ];
    }

    /**
     * Extract the text of all non-empty paragraphs, joined by blank lines.
     *
     * @return string|null The joined text, or null when there are no paragraphs.
     */
    public static function extractFullArticle(string $html): ?string
    {
        $paragraphs = self::paragraphs($html);

        return $paragraphs === [] ? null : implode("\n\n", $paragraphs);
    }

    /**
     * Return the `content` of the first `<meta>` tag whose `property`
     * attribute equals $property (case-insensitive) and whose content is not
     * empty. Attribute order and quote style do not matter.
     */
    private static function metaContent(string $html, string $property): ?string
    {
        if (!preg_match_all('/<meta\b[^>]*>/i', $html, $tags)) {
            return null;
        }

        foreach ($tags[0] as $tag) {
            $attributes = self::attributes($tag);

            if (strcasecmp($attributes['property'] ?? '', $property) !== 0) {
                continue;
            }

            $content = self::normalizeText($attributes['content'] ?? '');

            if ($content !== null) {
                return $content;
            }
        }

        return null;
    }

    /**
     * Parse the quoted attributes of a single tag into a lower-cased map.
     *
     * @return array<string, string>
     */
    private static function attributes(string $tag): array
    {
        preg_match_all(
            '/([a-z][\w:-]*)\s*=\s*(?:"([^"]*)"|\'([^\']*)\')/i',
            $tag,
            $pairs,
            PREG_SET_ORDER
        );

        $attributes = [];

        foreach ($pairs as $pair) {
            // Group 3 only exists for single-quoted values.
            $attributes[strtolower($pair[1])] = $pair[3] ?? $pair[2];
        }

        return $attributes;
    }

    /**
     * Return the text of the first $tag element that is not empty once its
     * nested markup has been stripped.
     */
    private static function firstElementText(string $html, string $tag): ?string
    {
        // \b keeps e.g. "<p" from matching "<pre>"; (.*?) allows nested tags.
        $pattern = '/<' . $tag . '\b[^>]*>(.*?)<\/' . $tag . '\s*>/is';

        if (!preg_match_all($pattern, $html, $matches)) {
            return null;
        }

        foreach ($matches[1] as $inner) {
            $text = self::normalizeText(strip_tags($inner));

            if ($text !== null) {
                return $text;
            }
        }

        return null;
    }

    /**
     * Return the text of every non-empty `<p>` element, in document order,
     * ignoring anything inside `<script>` or `<style>`.
     *
     * @return list<string>
     */
    private static function paragraphs(string $html): array
    {
        // preg_replace() returns null on a PCRE error; fall back to the input.
        $content = preg_replace('/<(script|style)\b[^>]*>.*?<\/\1\s*>/is', '', $html) ?? $html;

        if (!preg_match_all('/<p\b[^>]*>(.*?)<\/p\s*>/is', $content, $matches)) {
            return [];
        }

        $paragraphs = [];

        foreach ($matches[1] as $inner) {
            $text = self::normalizeText(strip_tags($inner));

            if ($text !== null) {
                $paragraphs[] = $text;
            }
        }

        return $paragraphs;
    }

    /**
     * Decode entities, collapse whitespace and trim; null when nothing is left.
     */
    private static function normalizeText(string $text): ?string
    {
        $decoded = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $collapsed = trim(preg_replace('/\s+/', ' ', $decoded) ?? $decoded);

        return $collapsed === '' ? null : $collapsed;
    }
}
|
||||||
|
|
@ -15,6 +15,33 @@ public static function getNewArticles(): Collection
|
||||||
->map(fn (string $url) => self::saveArticle($url));
|
->map(fn (string $url) => self::saveArticle($url));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Download the article's page and extract its data.
 *
 * Failures are logged and reported as an empty array rather than thrown.
 *
 * @return array The result of ArticleDataExtractor::extractData() on
 *               success; an empty array when the response is not
 *               successful or an exception occurs while fetching.
 */
public static function fetchArticle(Article $article): array
{
    try {
        $response = Http::get($article->url);

        if (!$response->successful()) {
            logger()->error('Failed to fetch article', [
                'url' => $article->url,
                'status' => $response->status()
            ]);

            return [];
        }

        return ArticleDataExtractor::extractData($response->body());
    } catch (Exception $e) {
        logger()->error('Exception while fetching article', [
            'url' => $article->url,
            'error' => $e->getMessage()
        ]);

        // Must be an array: returning the Article model here violated the
        // declared return type and raised a TypeError on every fetch failure.
        return [];
    }
}
|
||||||
|
|
||||||
|
|
||||||
private static function fetchArticles(): Collection
|
private static function fetchArticles(): Collection
|
||||||
{
|
{
|
||||||
try {
|
try {
|
||||||
|
|
@ -60,7 +87,7 @@ private static function fetchArticles(): Collection
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static function saveArticle(string $url): Article
|
private static function saveArticle(string $url): Article
|
||||||
{
|
{
|
||||||
return Article::firstOrCreate(['url' => $url]);
|
return Article::firstOrCreate(['url' => $url]);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,70 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
namespace App\Services\Article;
|
|
||||||
|
|
||||||
use App\Models\Article;
|
|
||||||
use Illuminate\Support\Facades\Http;
|
|
||||||
use Illuminate\Support\Facades\Cache;
|
|
||||||
use Exception;
|
|
||||||
|
|
||||||
/**
 * Static helper that authenticates against the configured Lemmy instance.
 *
 * Only the login step is implemented here; no post is created.
 */
class LemmyService
{
    /**
     * Obtain a Lemmy auth token in preparation for publishing.
     *
     * The previous debug dump of the token (which printed the secret and
     * terminated the process) and the commented-out experiments were removed.
     *
     * @param Article $article Currently unused; nothing is posted yet.
     *
     * @return bool True when a token could be obtained, false otherwise.
     */
    public static function publish(Article $article): bool
    {
        return self::getJwtToken() !== null;
    }

    /**
     * Return the cached JWT, logging in when none is cached (one hour TTL).
     */
    private static function getJwtToken(): ?string
    {
        return Cache::remember('lemmy_jwt_token', 3600, function () {
            return self::login();
        });
    }

    /**
     * Log in with the `lemmy.*` configuration and return the JWT.
     *
     * @return string|null The JWT, or null when configuration is missing,
     *                     the request fails or throws, or no `jwt` is returned.
     */
    private static function login(): ?string
    {
        $username = config('lemmy.username');
        $password = config('lemmy.password');
        $instance = config('lemmy.instance');

        if (!$username || !$password || !$instance) {
            logger()->error('Missing Lemmy configuration');
            return null;
        }

        try {
            $response = Http::post("https://$instance/api/v3/user/login", [
                'username_or_email' => $username,
                'password' => $password,
            ]);

            if (!$response->successful()) {
                logger()->error('Lemmy login failed', [
                    'status' => $response->status(),
                    'body' => $response->body()
                ]);
                return null;
            }

            $data = $response->json();
            $jwt = is_array($data) ? ($data['jwt'] ?? null) : null;

            // Guard the ?string return type against non-string payloads.
            return is_string($jwt) ? $jwt : null;
        } catch (Exception $e) {
            logger()->error('Lemmy login exception', ['error' => $e->getMessage()]);
            return null;
        }
    }
}
|
|
||||||
|
|
@ -11,6 +11,8 @@ public function up(): void
|
||||||
Schema::create('articles', function (Blueprint $table) {
|
Schema::create('articles', function (Blueprint $table) {
|
||||||
$table->id();
|
$table->id();
|
||||||
$table->string('url');
|
$table->string('url');
|
||||||
|
$table->string('title')->nullable();
|
||||||
|
$table->text('description')->nullable();
|
||||||
$table->boolean('is_valid')->nullable();
|
$table->boolean('is_valid')->nullable();
|
||||||
$table->timestamp('validated_at')->nullable();
|
$table->timestamp('validated_at')->nullable();
|
||||||
$table->timestamps();
|
$table->timestamps();
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue