fedi_slurp/fedi_slurp.php

246 lines
6 KiB
PHP
Executable file
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/php
<?php
require("add_to_fedilist.php");
//-----------------------------
// CREDENTIALS
//-----------------------------
// Articles whose tag-stripped body is shorter than this (strlen, i.e. bytes) are ignored.
$MINIMUM_TEXT_SIZE = 500;

$fediAccounts = loadAccounts(__DIR__ . '/_credentials/fedi_accounts.txt');
$readeckAccount = loadAccounts(__DIR__ . '/_credentials/readeck_account.txt');

// _credentials/readeck_account.txt
// should have only one line with host|token
// ex: gone.lema.org|XXXXYYYXXXYYY
//
// Stop early when the Readeck credentials are missing or incomplete: without
// them every later request would target "https:///api/bookmarks" with an
// empty bearer token.
$acc = $readeckAccount[0] ?? null;
if ($acc === null || ($acc['host'] ?? '') === '' || ($acc['token'] ?? '') === '') {
    echo "❌ No usable Readeck account found in _credentials/readeck_account.txt (expected one line: host|token)\n";
    exit(1);
}
$READECK_HOST = $acc['host'];
$READECK_TOKEN = $acc['token'];

echo "Readeck Host: $READECK_HOST \n";
echo "Fedi Accounts to loop: ".count($fediAccounts)."\n";

// _credentials/fedi_accounts.txt
// one account per line, formatted as host|token
// ex: gotosocial.lema.org|XXXXYYYXXXYYY
// Values that do not depend on the account being processed.
date_default_timezone_set('America/Sao_Paulo'); // only affects the timestamp echoed below
$apiUrl = "https://$READECK_HOST/api/bookmarks";
$headers = [
    "Authorization: Bearer $READECK_TOKEN",
    'Accept: application/json',
    'Content-Type: application/json'
];
$alreadySentDir = __DIR__ . "/_already_sent";

// For every Fediverse account: fetch its bookmarks, take the first URL of each
// bookmarked post, and forward each URL either to the video list (YouTube) or
// to Readeck (anything else with enough text content).
// A failure on one account is reported and the loop moves on to the next one.
foreach ($fediAccounts as $acc) {
    $MASTODON_HOST = $acc['host'];
    $MASTODON_TOKEN = $acc['token'];

    // Only show the beginning of the bearer token so the full secret never
    // ends up in terminal scrollback or cron logs.
    $maskedToken = substr((string) $MASTODON_TOKEN, 0, 4) . '****';

    echo "--------------------------------\n";
    echo "Host: $MASTODON_HOST\n";
    echo "Token: $maskedToken\n";
    echo "--------------------------------\n";

    //-----------------------------
    // FETCH MASTODON BOOKMARKS
    //-----------------------------
    echo "# Fetching mastodon / gotosocial / snac bookmarks...\n";
    echo date('Y-m-d H:i:s')."\n";

    $bookmarksCh = curl_init("https://$MASTODON_HOST/api/v1/bookmarks");
    // GoToSocial will reply with error "I am a teapot" if no user agent is sent...
    curl_setopt_array($bookmarksCh, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_USERAGENT => "FediSlurperScript/1.0 (https://code.lema.org/santiago/fedi_slurp)",
        CURLOPT_HTTPHEADER => [
            "Authorization: Bearer $MASTODON_TOKEN",
            "Accept: application/json"
        ]
    ]);
    $bookmarksJson = curl_exec($bookmarksCh);
    $bookmarksError = curl_error($bookmarksCh);
    $bookmarksStatus = curl_getinfo($bookmarksCh, CURLINFO_HTTP_CODE);
    curl_close($bookmarksCh);

    if ($bookmarksJson === false) {
        echo "❌ Failed to fetch bookmarks from $MASTODON_HOST: $bookmarksError\n";
        continue;
    }
    if ($bookmarksStatus < 200 || $bookmarksStatus >= 300) {
        // An error reply is usually a JSON object, which would otherwise be
        // mistaken for a one-element bookmark list.
        echo "❌ $MASTODON_HOST returned status $bookmarksStatus for bookmarks.\n";
        continue;
    }

    $bookmarks = json_decode($bookmarksJson, true);
    if (!is_array($bookmarks)) {
        echo "❌ Failed to parse Mastodon bookmarks.\n";
        continue;
    }
    echo "Found bookmarks:".count($bookmarks)."\n";

    //-----------------------------
    // FIND VALID URLs in posts
    //-----------------------------
    // Start from an empty list for each account; otherwise links collected for
    // a previous account would be processed again for this one.
    $links = [];
    foreach ($bookmarks as $status) {
        if (!is_array($status) || !isset($status['content'])) {
            continue;
        }
        $content = strip_tags($status['content']);
        // Only the first URL found in a post is considered.
        if (!preg_match('/https?:\/\/[^\s"<]+/', $content, $match)) {
            continue;
        }
        $oneLink = $match[0];
        if (filter_var($oneLink, FILTER_VALIDATE_URL)) {
            $links[] = $oneLink;
        } else {
            // This happens for example if URL has an emoji at the end
            echo "INVALID URL: $oneLink\n";
        }
    }

    if ($links === []) {
        echo "NO links founds. Kthxbye \n";
        continue; // nothing to do for this account, but later accounts may have links
    }
    echo "Valid URLS:".count($links)."\n";
    print_r($links);

    //-----------------------------
    // SEND LINKS TO READECK
    //-----------------------------
    if (!is_dir($alreadySentDir)) {
        mkdir($alreadySentDir, 0755, true); // recursive mkdir
    }

    // Handle used ONLY to download the linked page for the size check.
    // It is a plain GET without any Authorization header, so the Readeck token
    // and JSON payload are never sent to third-party sites.
    $pageCh = curl_init();
    curl_setopt_array($pageCh, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_USERAGENT => 'Mozilla/5.0',
        CURLOPT_FOLLOWLOCATION => true, // measure the final page, not a redirect stub
        CURLOPT_MAXREDIRS => 5,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT => 30 // a stalled site must not hang the whole run
    ]);

    // Handle used ONLY to POST to the Readeck API.
    $readeckCh = curl_init($apiUrl);
    curl_setopt_array($readeckCh, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_USERAGENT => 'Mozilla/5.0',
        CURLOPT_HTTPHEADER => $headers
    ]);

    foreach ($links as $link) {
        if (isYouTubeLink($link)) {
            addVideoToFediList($link);
            continue;
        }

        // READECK will accept several times the same URL !
        // Make sure we don't send it several times by keeping an archive here
        $filePath = $alreadySentDir . '/' . md5($link) . '.txt';
        if (file_exists($filePath)) {
            echo " Already sent: $link\n";
            continue;
        }

        // First check if page has content
        curl_setopt($pageCh, CURLOPT_URL, $link);
        $content = curl_exec($pageCh);
        if ($content === false) {
            echo "❌ Failed to fetch $link\n";
            continue;
        }
        $textLength = strlen(strip_tags($content));
        if ($textLength < $MINIMUM_TEXT_SIZE) {
            echo "⚠️ Skipping $link\ncontent too small ($textLength chars < $MINIMUM_TEXT_SIZE )\n";
            continue;
        }
        echo "🟢 Will add to Readeck $link\nLength: $textLength\n";

        // not passing title here, since we don't have it
        $payload = json_encode([
            "labels" => ["automasto"],
            "url" => $link
        ]);
        curl_setopt($readeckCh, CURLOPT_POSTFIELDS, $payload);
        $response = curl_exec($readeckCh);
        $httpCode = curl_getinfo($readeckCh, CURLINFO_HTTP_CODE);

        if ($response === false || curl_errno($readeckCh)) {
            echo "❌ Error adding $link: " . curl_error($readeckCh) . "\n";
            continue;
        }

        // Remember the link only when Readeck actually accepted it, so a
        // rejected request (bad token, server error) is retried on the next run.
        $accepted = $httpCode >= 200 && $httpCode < 300;
        if ($accepted) {
            file_put_contents($filePath, $link);
        }

        json_decode($response, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            echo "⚠️ Response is not valid JSON for $link: $response\n";
        } elseif ($accepted) {
            echo "✅ [$httpCode] Successfully added: $link\n";
        } else {
            echo "⚠️ Server returned status $httpCode for $link\n";
        }
    }

    curl_close($pageCh);
    curl_close($readeckCh);
} // end accounts loop
/**
 * Load "host|token" credential pairs from a text file.
 *
 * Each non-blank line must contain a host and a token separated by the first
 * "|" character (the token itself may contain further "|"). Surrounding
 * whitespace is removed from both parts. Lines without a separator, or with an
 * empty host or token, are reported on stdout (by line number only, so no
 * secret is printed) and skipped.
 *
 * @param string $filepath Path of the credentials file.
 * @return array List of ['host' => string, 'token' => string] entries, in file
 *               order; empty when the file is missing or cannot be read.
 */
function loadAccounts(string $filepath): array {
    // is_file() rather than file_exists(): a directory with that name is not a credentials file.
    if (!is_file($filepath) || !is_readable($filepath)) {
        return [];
    }

    // Blank lines are kept here (and skipped below) so that the reported
    // line numbers match the file.
    $lines = file($filepath, FILE_IGNORE_NEW_LINES);
    if ($lines === false) {
        return [];
    }

    $accounts = [];
    foreach ($lines as $index => $line) {
        $line = trim($line);
        if ($line === '') {
            continue;
        }

        $parts = explode('|', $line, 2);
        $host = trim($parts[0]);
        $token = isset($parts[1]) ? trim($parts[1]) : '';
        if ($host === '' || $token === '') {
            echo "⚠️ Ignoring malformed line " . ($index + 1) . " in " . basename($filepath) . " (expected host|token)\n";
            continue;
        }

        $accounts[] = [
            'host' => $host,
            'token' => $token
        ];
    }
    return $accounts;
}