From a5c3105a7af60a413f65b526e2045c8ccc5d7264 Mon Sep 17 00:00:00 2001 From: sweing Date: Tue, 9 Jan 2024 18:51:15 +0100 Subject: [PATCH 01/12] Remove Twitter, add Mastodon --- api/__init__.py | 4 +- api/modules/{tweets.py => toots.py} | 38 ++--- config.template.json | 10 ++ downloadFromApi.py | 43 ++++++ getOldToots.py | 61 ++++++++ lib/Toot.py | 152 +++++++++++++++++++ lib/TootForest.py | 218 ++++++++++++++++++++++++++++ lib/Tweet.py | 146 ------------------- lib/TweetForest.py | 108 -------------- lib/mastodon_auth.py | 15 ++ lib/mastodon_base.py | 55 +++++++ lib/shared.py | 6 +- lib/{tweets_aux.py => toots_aux.py} | 15 +- lib/tweets_base.py | 43 ------ lib/twitter_auth.py | 18 --- runInit.py | 3 +- runListener.py | 100 +++++++++---- 17 files changed, 660 insertions(+), 375 deletions(-) rename api/modules/{tweets.py => toots.py} (69%) create mode 100644 downloadFromApi.py create mode 100644 getOldToots.py create mode 100644 lib/Toot.py create mode 100644 lib/TootForest.py delete mode 100644 lib/Tweet.py delete mode 100644 lib/TweetForest.py create mode 100644 lib/mastodon_auth.py create mode 100644 lib/mastodon_base.py rename lib/{tweets_aux.py => toots_aux.py} (70%) delete mode 100644 lib/tweets_base.py delete mode 100644 lib/twitter_auth.py diff --git a/api/__init__.py b/api/__init__.py index e1b8ada..255f864 100644 --- a/api/__init__.py +++ b/api/__init__.py @@ -1,6 +1,6 @@ from flask import Flask, current_app, render_template, request, session from flask_cors import CORS -from .modules import tweets, log +from .modules import toots, log from .modules.auth import login_exempt from lib.shared import authToken, secretKey from datetime import timedelta @@ -12,7 +12,7 @@ def create_app(test_config = None): cors = CORS(app) - app.register_blueprint(tweets.bp) + app.register_blueprint(toots.bp) app.register_blueprint(log.bp) @app.route('/') diff --git a/api/modules/tweets.py b/api/modules/toots.py similarity index 69% rename from api/modules/tweets.py rename to api/modules/toots.py index cddbbfb..920d4e3 100644 --- a/api/modules/tweets.py +++ b/api/modules/toots.py @@ -1,13 +1,14 @@ from datetime import datetime import logging +import json from flask import Blueprint, jsonify, Response, render_template, abort from flask_cors import cross_origin from .auth import login_exempt -from lib.tweets_base import readTweetsApiJson -from lib.Tweet import Tweet -from lib.TweetForest import TweetForest +from lib.mastodon_base import readTootsApiJson +from lib.Toot import Toot +from lib.TootForest import TootForest bp = Blueprint('tweets', __name__, url_prefix='/tweets') @@ -15,37 +16,36 @@ @cross_origin() @login_exempt def root(): - return jsonify({ + return Response(json.dumps({ 'date': int(datetime.timestamp(datetime.now())), - 'tweets': readTweetsApiJson() - }) + 'tweets': readTootsApiJson() + }), mimetype='application/json') @bp.route('/') def tweet(id): - tweet = Tweet.loadFromFile(id) + tweet = Toot.loadFromFile(id) return jsonify(tweet.data) @bp.route('/forest') def forest_show(): - return render_template('forest.html.j2', forest = TweetForest.fromFolder()) + return render_template('forest.html.j2', forest = TootForest.fromFolder()) #return Response(str(forest), mimetype='text/plain') @bp.route('/forest/renew') def forest_create(): logging.warning('Manual invocation of creating forest!') - # renew forest - forest = TweetForest.fromFolder() + forest = TootForest.fromFolder() forest.saveApiJson() - return jsonify(readTweetsApiJson()) + return Response(json.dumps(readTootsApiJson()), mimetype='application/json') @bp.route('/add/') def add(id): logging.warning('Manual invocation of adding tweet (id: {})!'.format(id)) - tweet = Tweet.loadFromTwitter(id) + tweet = Toot.loadFromMastodon(id) tweet.save() # renew forest - forest = TweetForest.fromFolder() + forest = TootForest.fromFolder() forest.saveApiJson() return jsonify({ 'message': 'added', @@ -58,10 +58,10 @@ def add(id): @bp.route('/delete/') def delete(id): logging.warning('Manual invocation of deleting tweet (id: {})!'.format(id)) - tweet = Tweet.loadFromFile(id) + tweet = Toot.loadFromFile(id) tweet.delete() # renew forest - forest = TweetForest.fromFolder() + forest = TootForest.fromFolder() forest.saveApiJson() return jsonify({ 'message': 'deleted', @@ -73,19 +73,19 @@ def delete(id): @bp.route('/all') def all(): - tweets = Tweet.loadFromFolder() + tweets = Toot.loadFromFolder() tweets.sort(key = lambda x: x.getDateTime(), reverse = True) - # [Tweet(i) for i in tweets] + # [Toot(i) for i in tweets] return render_template('all.html.j2', tweets = tweets) @bp.route('/stories') def info(): - tweets = readTweetsApiJson() + tweets = readTootsApiJson() stories = {}; for id, info in tweets.items(): if 'story' in info: storyId = info['story'] if storyId not in stories: stories[storyId] = [] - stories[storyId].append(Tweet.loadFromFile(id)) + stories[storyId].append(Toot.loadFromFile(id)) return render_template('stories.html.j2', stories = stories) diff --git a/config.template.json b/config.template.json index f54f6a2..6cf829d 100644 --- a/config.template.json +++ b/config.template.json @@ -1,6 +1,16 @@ { "authToken": "xxx", "secretKey": "xxx", + "mastodon": { + "instance_url": "https://mastodon.instance", + "client": { + "key": "xxx", + "secret": "xxx" + }, + "access": { + "token": "xxx" + } + }, "twitter": { "api": { "key": "xxx", diff --git a/downloadFromApi.py b/downloadFromApi.py new file mode 100644 index 0000000..c4e7585 --- /dev/null +++ b/downloadFromApi.py @@ -0,0 +1,43 @@ +import os +import requests +import json +from lib.shared import authToken + +BASE_URL = "https://dev.decarbnow.space" +AUTH_URL = BASE_URL + "/authenticate/" +DOWNLOAD_URL = BASE_URL + "/tweets/" + +# Create a session to maintain authentication +session = requests.Session() + +# Authenticate using the provided token +auth_response = session.get(AUTH_URL + authToken) +if auth_response.status_code != 200: + print(f"Authentication failed with status code: {auth_response.status_code}") + exit() + +# Path to the data folder +data_folder = "data" +os.makedirs(data_folder, exist_ok=True) + +# Load tweet IDs from fromAPI.json +with open(os.path.join(data_folder, "fromAPI.json"), "r") as json_file: + tweet_data = json.load(json_file) + tweet_ids = tweet_data.get("tweets", {}).keys() + +# Download files for each tweet ID +for tweet_id in tweet_ids: + download_url = DOWNLOAD_URL + tweet_id + + # Send request to download the file + file_response = session.get(download_url) + if file_response.status_code == 200: + # Save the file in the data folder + with open(os.path.join(data_folder, "live", f"{tweet_id}.json"), "wb") as file: + file.write(file_response.content) + print(f"Downloaded file for tweet ID {tweet_id}") + else: + print(f"Failed to download file for tweet ID {tweet_id} with status code: {file_response.status_code}") + +# Close the session +session.close() diff --git a/getOldToots.py b/getOldToots.py new file mode 100644 index 0000000..728063f --- /dev/null +++ b/getOldToots.py @@ -0,0 +1,61 @@ +from mastodon import Mastodon +import json +from datetime import datetime +import os +import shutil +from lib.mastodon_auth import get_auth_user + +from lib.shared import base_folder, tootsFetchSettings, toots_folder + +# Define the URL you want to check for in the messages +keyword = tootsFetchSettings['listen'] + '/@' + +# Define the path for the data folder and archive subfolder +data_folder = os.path.join(toots_folder, tootsFetchSettings['folder']) + +archive_folder = os.path.join(data_folder, "archive") + +# Ensure the data and archive folders exist +os.makedirs(data_folder, exist_ok=True) +os.makedirs(archive_folder, exist_ok=True) + +# Get a list of existing message IDs in the data folder +existing_ids = [filename.split(".")[0] for filename in os.listdir(data_folder) if filename.endswith(".json")] + +# Create an instance of Mastodon +mastodon = get_auth_user() + +# Custom JSON encoder to handle datetime objects +class DateTimeEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, datetime): + return obj.strftime('%Y-%m-%d %H:%M:%S') + return super().default(obj) + +# Search for messages containing the target URL +search_results = mastodon.search(keyword) + +# Check each search result for the target URL +for status in search_results['statuses']: + # Generate a unique filename using ID + message_id = str(status['id']) + filename = f'{message_id}.json' + filepath = os.path.join(data_folder, filename) + + # Save the relevant information to a JSON file + with open(filepath, 'w') as output_file: + json.dump(status, output_file, cls=DateTimeEncoder) + + print(f'Message with ID {message_id} saved to {filepath}') + + # Remove the ID from the existing IDs list if found + if message_id in existing_ids: + existing_ids.remove(message_id) + +# Move any remaining files in the data folder to the archive folder +for message_id in existing_ids: + filename = f'{message_id}.json' + filepath = os.path.join(data_folder, filename) + archive_filepath = os.path.join(archive_folder, filename) + shutil.move(filepath, archive_filepath) + print(f'Message with ID {message_id} moved to archive folder') diff --git a/lib/Toot.py b/lib/Toot.py new file mode 100644 index 0000000..987d7d6 --- /dev/null +++ b/lib/Toot.py @@ -0,0 +1,152 @@ +from .shared import toots_folder, tootsFetchSettings +#from .mastodon_auth import get_api_app +from .mastodon_base import * + +from datetime import datetime +from html.parser import HTMLParser + +class URLExtractor(HTMLParser): + def __init__(self): + super().__init__() + self.urls = [] + + def handle_starttag(self, tag, attrs): + if tag == 'a': + self.urls.extend(value for name, value in attrs if name == 'href') + +class Toot(object): + data = None + + @staticmethod + def loadFromFile(id, folder = tootsFetchSettings['folder']): + return Toot(readTootFromFolder(id, folder)) + + @staticmethod + def loadFromFolder(folder = tootsFetchSettings['folder']): + return [Toot(j) for j in readTootsFromFolder(folder)] + + @staticmethod + def loadFromMastodon(id): + return Toot(readTootFromMastodon(id)) + + + def __init__(self, data): + self.data = data + self.children = [] + + def save(self, folder = tootsFetchSettings['folder']): + writeTootToFolder(self.data, folder) + + def delete(self, folder = tootsFetchSettings['folder']): + deleteTootFromFolder(self.getId(), folder) + + def addChild(self, toot): + self.children.append(toot) + + def getChildren(self): + return self.children + + def hasChildren(self): + return len(self.children) > 0 + + def asApiDict(self, add = {}): + return { + 'url': self.getPathOfLinksTo()[0], + 'hashtags': self.getHashtags(), + 'timestamp': str(self.getDateTime()), + 'content': self.getText(), + 'account': self.getUserName(), + 'display_name': self.getUserScreenName(), + 'avatar': self.getUserImageHttps(), + 'media': self.getMedia(), + **add + } + + def getData(self): + return self.data + + def toStringList(self): + return [ + "{}, {}, {}".format(self.getUserName(), self.getDateTime(), self.getId()), + "# {}".format(', '.join(self.getHashtags())), + "▶ {}".format(', '.join(self.getLinks())), + "¶ {}".format(self.getText()) + ] + + def __str__(self): + return '\n'.join(self.toStringList()) + + def getId(self): + return self.data['id'] + + def getUserId(self): + return self.data['account']['id'] + + def getUserName(self): + return self.data['account']['username'] + + def getUserScreenName(self): + return self.data['account']['display_name'] + + def getUserImageHttps(self): + return self.data['account']['avatar_static'] + + def getDateTime(self): + return datetime.strptime(self.data['created_at'], '%Y-%m-%d %H:%M:%S') + + def getMedia(self): + if 'media_attachments' in self.data: + return [item['preview_url'] for item in self.data['media_attachments'] if 'preview_url' in item] + return [] + + def getText(self): + if 'content' in self.data: + return self.data['content'] + elif 'extended_toot' in self.data: + if 'full_text' in self.data['extended_toot']: + return self.data['extended_toot']['full_text'] + elif 'retooted_status' in self.data and 'extended_toot' in self.data['retooted_status'] and self.data['retooted_status']['extended_toot']['full_text']: + return self.data['retooted_status']['extended_toot']['full_text'] + elif 'text' in self.data: + return self.data['text'] + + def isReply(self): + return 'in_reply_to_id' in self.data + + def getReplyToId(self): + return self.data['in_reply_to_id'] + + def getReplyToUserId(self): + return self.data['in_reply_to_account_id'] + + def isSelfReply(self): + return self.isReply() and self.getUserId() == self.getReplyToUserId() + + def getHashtags(self): + hashtags = [] + if self.data['tags']: + hashtags.extend([h['name'] for h in self.data['tags']]) + if 'extended_toot' in self.data and self.data['extended_toot']['entities']['hashtags']: + hashtags.extend([h['text'] for h in self.data['extended_toot']['entities']['hashtags']]) + return hashtags + + def hasHashtag(self, hashtag): + return hashtag in self.getHashtags() + + def getLinks(self): + urls = [] + if 'content' in self.data: + extractor = URLExtractor() + extractor.feed(self.data['content']) + urls.extend(extractor.urls) + return urls + + def getPathOfLinksTo(self, to = tootsFetchSettings['link']): + return [u.split(to, 1)[1] for u in self.getLinks() if to in u] + + def hasLinkTo(self, to = f"{tootsFetchSettings['link']}/@"): + return len(self.getPathOfLinksTo(to)) > 0 + + def getUrl(self): + if 'url' in self.data: + return self.data['url'] diff --git a/lib/TootForest.py b/lib/TootForest.py new file mode 100644 index 0000000..b70902a --- /dev/null +++ b/lib/TootForest.py @@ -0,0 +1,218 @@ +from .shared import tootsFetchSettings +from .mastodon_base import readTootsFromFolder, writeTootsApiJson +from .Toot import Toot +from collections import OrderedDict + +class TootForest(object): + trunks = [] + + @staticmethod + def fromFolder(folder = tootsFetchSettings['folder']): + toots = Toot.loadFromFolder(folder) + toots = filter(lambda t: t.hasLinkTo(), toots) + # toots = filter(lambda t: not t.hasHashtag('private'), toots) + return TootForest(toots) + + def __init__(self, toots): + self.trunks = [] + tootsDict = {t.getId(): t for t in toots} + + for toot in tootsDict.values(): + if toot.isSelfReply(): + if toot.getReplyToId() in tootsDict: + tootsDict[toot.getReplyToId()].addChild(toot) + continue + self.trunks.append(toot) + + def __str__(self): + def printLeafs(toots, i = 0): + a = [] + for toot in toots: + a.extend(["{}{}".format(' ' * i, l) for l in toot.toStringList()]) + a.append('') + if toot.hasChildren(): + a.extend(printLeafs(toot.getChildren(), i + 1)) + return a + return '\n'.join(printLeafs(self.trunks)) + + def asApiJson(self): + apiJson = {}; + for toot in self.trunks: + if toot.hasChildren(): + story = toot.getId() + apiJson[story] = toot.asApiDict({'story': str(story)}) + while toot.hasChildren(): + # ONLY FIRST CHILD + toot = toot.getChildren()[0] + apiJson[toot.getId()] = toot.asApiDict({'story': str(story)}) + else: + apiJson[toot.getId()] = toot.asApiDict() + + return apiJson + + def rename_ids(self, data): + # Create a dictionary to store the count of each story ID + story_counts = {} + + # Create a dictionary to store the length of each story + story_lengths = {} + + # Calculate the length of each story before the loop + for tweet_id, values in data.items(): + # Check if "story" key is present, otherwise use tweet_id as the story ID + story_id = values.get("story", tweet_id) + + # Update the length of the story + story_lengths[story_id] = story_lengths.get(story_id, 0) + 1 + + # Iterate through the original dictionary and create a new one with modified keys + result = {} + for tweet_id, values in data.items(): + # Check if "story" key is present, otherwise use tweet_id as the story ID + story_id = values.get("story", tweet_id) + + # Calculate the count as the difference between the length of the story and the current length + count = story_lengths[story_id] + story_lengths[story_id] = story_lengths[story_id] - 1 + + # Create the new key in the format "story_id.count_tweet_id" + new_key = f"{story_id}.{count}_{tweet_id}" + result[new_key] = values + + return result + + def revert_ids(self, data): + # Iterate through the dictionary and create a new one with reverted keys + result = {} + for key, values in data.items(): + # Split the key into story_id.count_tweet_id + parts = key.split('_') + + # Extract the tweet_id from the second part + tweet_id = parts[1] + + # Create the new key with the original tweet_id + new_key = tweet_id + result[new_key] = values + + return result + + + def saveApiJson(self, file = tootsFetchSettings['file']): + data = self.asApiJson() + renamed_data = self.rename_ids(data) + # Sort toots based on id in descending order + sorted_toots = OrderedDict(sorted(renamed_data.items(), key=lambda x: x[0], reverse=True)) + final_toots = self.revert_ids(sorted_toots) + + writeTootsApiJson(final_toots, file) + + +# from .shared import tweetsFetchSettings +# from .mastodon_base import readTweetsFromFolder, writeTweetsApiJson +# from .Tweet import Tweet +# from collections import OrderedDict + +# class TweetForest(object): +# trunks = [] + +# @staticmethod +# def fromFolder(folder = tweetsFetchSettings['folder']): +# tweets = Tweet.loadFromFolder(folder) +# tweets = filter(lambda t: t.hasLinkTo(), tweets) +# # tweets = filter(lambda t: not t.hasHashtag('private'), tweets) +# return TweetForest(tweets) + +# def __init__(self, tweets): +# self.trunks = [] +# tweetsDict = {t.getId(): t for t in tweets} + +# for tweet in tweetsDict.values(): +# if tweet.isSelfReply(): +# if tweet.getReplyToId() in tweetsDict: +# tweetsDict[tweet.getReplyToId()].addChild(tweet) +# continue + +# self.trunks.append(tweet) + +# def __str__(self): +# def printLeafs(tweets, i = 0): +# a = [] +# for tweet in tweets: +# a.extend(["{}{}".format(' ' * i, l) for l in tweet.toStringList()]) +# a.append('') +# if tweet.hasChildren(): +# a.extend(printLeafs(tweet.getChildren(), i + 1)) +# return a +# return '\n'.join(printLeafs(self.trunks)) + +# def asApiJson(self): +# apiJson = {}; +# for tweet in self.trunks: +# if tweet.hasChildren(): +# story = tweet.getId() +# apiJson[story] = tweet.asApiDict({'story': story}) +# while tweet.hasChildren(): +# # ONLY FIRST CHILD +# tweet = tweet.getChildren()[0] +# apiJson[tweet.getId()] = tweet.asApiDict({'story': story}) +# else: +# apiJson[tweet.getId()] = tweet.asApiDict() + +# return apiJson + +# def rename_ids(self, data): +# # Create a dictionary to store the count of each story ID +# story_counts = {} + +# # Create a dictionary to store the length of each story +# story_lengths = {} + +# # Calculate the length of each story before the loop +# for tweet_id, values in data.items(): +# # Check if "story" key is present, otherwise use tweet_id as the story ID +# story_id = values.get("story", tweet_id) + +# # Update the length of the story +# story_lengths[story_id] = story_lengths.get(story_id, 0) + 1 + +# # Iterate through the original dictionary and create a new one with modified keys +# result = {} +# for tweet_id, values in data.items(): +# # Check if "story" key is present, otherwise use tweet_id as the story ID +# story_id = values.get("story", tweet_id) + +# # Calculate the count as the difference between the length of the story and the current length +# count = story_lengths[story_id] +# story_lengths[story_id] = story_lengths[story_id] - 1 + +# # Create the new key in the format "story_id.count_tweet_id" +# new_key = f"{story_id}.{count}_{tweet_id}" +# result[new_key] = values + +# return result + +# def revert_ids(self, data): +# # Iterate through the dictionary and create a new one with reverted keys +# result = {} +# for key, values in data.items(): +# # Split the key into story_id.count_tweet_id +# parts = key.split('_') + +# # Extract the tweet_id from the second part +# tweet_id = parts[1] + +# # Create the new key with the original tweet_id +# new_key = tweet_id +# result[new_key] = values + +# return result + +# def saveApiJson(self, file = tweetsFetchSettings['file']): +# data = self.asApiJson() +# renamed_data = self.rename_ids(data) +# # Sort tweets based on id in descending order +# sorted_tweets = OrderedDict(sorted(renamed_data.items(), key=lambda x: x[0], reverse=True)) +# final_tweets = self.revert_ids(sorted_tweets) + +# writeTweetsApiJson(final_tweets, file) \ No newline at end of file diff --git a/lib/Tweet.py b/lib/Tweet.py deleted file mode 100644 index 60e550c..0000000 --- a/lib/Tweet.py +++ /dev/null @@ -1,146 +0,0 @@ -from .shared import tweets_folder, tweetsFetchSettings -from .twitter_auth import get_api_app -from .tweets_base import * - -from datetime import datetime - -class Tweet(object): - data = None - - @staticmethod - def loadFromFile(id, folder = tweetsFetchSettings['folder']): - return Tweet(readTweetFromFolder(id, folder)) - - @staticmethod - def loadFromFolder(folder = tweetsFetchSettings['folder']): - return [Tweet(j) for j in readTweetsFromFolder(folder)] - - @staticmethod - def loadFromTwitter(id): - return Tweet(readTweetFromTwitter(id)) - - - def __init__(self, data): - self.data = data - self.children = [] - - def save(self, folder = tweetsFetchSettings['folder']): - writeTweetToFolder(self.data, folder) - - def delete(self, folder = tweetsFetchSettings['folder']): - deleteTweetFromFolder(self.getId(), folder) - - def addChild(self, tweet): - self.children.append(tweet) - - def getChildren(self): - return self.children - - def hasChildren(self): - return len(self.children) > 0 - - def asApiDict(self, add = {}): - return { - 'url': self.getPathOfLinksTo()[0], - 'hashtags': self.getHashtags(), - 'timestamp': str(self.getDateTime()), - 'content': self.getText(), - 'account': self.getUserName(), - 'display_name': self.getUserScreenName(), - 'avatar': self.getUserImageHttps(), - 'media': self.getMedia(), - **add - } - - def getData(self): - return self.data - - def toStringList(self): - return [ - "{}, {}, {}".format(self.getUserName(), self.getDateTime(), self.getId()), - "# {}".format(', '.join(self.getHashtags())), - "▶ {}".format(', '.join(self.getLinks())), - "¶ {}".format(self.getText()) - ] - - def __str__(self): - return '\n'.join(self.toStringList()) - - def getId(self): - return self.data['id_str'] - - def getUserId(self): - return self.data['user']['id_str'] - - def getUserName(self): - return self.data['user']['name'] - - def getUserScreenName(self): - return self.data['user']['screen_name'] - - def getUserImageHttps(self): - return self.data['user']['profile_image_url_https'] - - def getDateTime(self): - return datetime.strptime(self.data['created_at'], '%a %b %d %H:%M:%S +0000 %Y') - - def getMedia(self): - if 'extended_tweet' in self.data and 'extended_entities' in self.data['extended_tweet'] and 'media' in self.data['extended_tweet']['extended_entities']: - return([item['media_url_https'] for item in self.data['extended_tweet']['extended_entities']['media']]) - elif 'extended_entities' in self.data and 'media' in self.data['extended_entities']: - return([item['media_url_https'] for item in self.data['extended_entities']['media']]) - elif 'quoted_status' in self.data and 'extended_entities' in self.data['quoted_status'] and 'media' in self.data['quoted_status']['extended_entities']: - return([item['media_url_https'] for item in self.data['quoted_status']['extended_entities']['media']]) - elif 'quoted_status' in self.data and 'extended_tweet' in self.data['quoted_status'] and 'extended_entities' in self.data['quoted_status']['extended_tweet'] and 'media' in self.data['quoted_status']['extended_tweet']['extended_entities']: - return([item['media_url_https'] for item in self.data['quoted_status']['extended_tweet']['extended_entities']['media']]) - - def getText(self): - if 'full_text' in self.data: - return self.data['full_text'] - elif 'extended_tweet' in self.data: - if 'full_text' in self.data['extended_tweet']: - return self.data['extended_tweet']['full_text'] - elif 'retweeted_status' in self.data and 'extended_tweet' in self.data['retweeted_status'] and self.data['retweeted_status']['extended_tweet']['full_text']: - return self.data['retweeted_status']['extended_tweet']['full_text'] - elif 'text' in self.data: - return self.data['text'] - - def isReply(self): - return 'in_reply_to_status_id_str' in self.data - - def getReplyToId(self): - return self.data['in_reply_to_status_id_str'] - - def getReplyToUserId(self): - return self.data['in_reply_to_user_id_str'] - - def isSelfReply(self): - return self.isReply() and self.getUserId() == self.getReplyToUserId() - - def getHashtags(self): - hashtags = [] - if self.data['entities']['hashtags']: - hashtags.extend([h['text'] for h in self.data['entities']['hashtags']]) - if 'extended_tweet' in self.data and self.data['extended_tweet']['entities']['hashtags']: - hashtags.extend([h['text'] for h in self.data['extended_tweet']['entities']['hashtags']]) - return hashtags - - def hasHashtag(self, hashtag): - return hashtag in self.getHashtags() - - def getLinks(self): - urls = [] - if self.data['entities']['urls']: - urls.extend([url['expanded_url'] for url in self.data['entities']['urls']]) - if 'extended_tweet' in self.data and self.data['extended_tweet']['entities']['urls']: - urls.extend([url['expanded_url'] for url in self.data['extended_tweet']['entities']['urls']]) - return urls - - def getPathOfLinksTo(self, to = tweetsFetchSettings['link']): - return [u.split(to, 1)[1] for u in self.getLinks() if to in u] - - def hasLinkTo(self, to = f"{tweetsFetchSettings['link']}/@"): - return len(self.getPathOfLinksTo(to)) > 0 - - def getUrl(self): - return 'https://twitter.com/{}/status/{}'.format(self.getUserScreenName(), self.getId()) diff --git a/lib/TweetForest.py b/lib/TweetForest.py deleted file mode 100644 index 1b3e37a..0000000 --- a/lib/TweetForest.py +++ /dev/null @@ -1,108 +0,0 @@ -from .shared import tweetsFetchSettings -from .tweets_base import readTweetsFromFolder, writeTweetsApiJson -from .Tweet import Tweet -from collections import OrderedDict - -class TweetForest(object): - trunks = [] - - @staticmethod - def fromFolder(folder = tweetsFetchSettings['folder']): - tweets = Tweet.loadFromFolder(folder) - tweets = filter(lambda t: t.hasLinkTo(), tweets) - # tweets = filter(lambda t: not t.hasHashtag('private'), tweets) - return TweetForest(tweets) - - def __init__(self, tweets): - self.trunks = [] - tweetsDict = {t.getId(): t for t in tweets} - - for tweet in tweetsDict.values(): - if tweet.isSelfReply(): - if tweet.getReplyToId() in tweetsDict: - tweetsDict[tweet.getReplyToId()].addChild(tweet) - continue - - self.trunks.append(tweet) - - def __str__(self): - def printLeafs(tweets, i = 0): - a = [] - for tweet in tweets: - a.extend(["{}{}".format(' ' * i, l) for l in tweet.toStringList()]) - a.append('') - if tweet.hasChildren(): - a.extend(printLeafs(tweet.getChildren(), i + 1)) - return a - return '\n'.join(printLeafs(self.trunks)) - - def asApiJson(self): - apiJson = {}; - for tweet in self.trunks: - if tweet.hasChildren(): - story = tweet.getId() - apiJson[story] = tweet.asApiDict({'story': story}) - while tweet.hasChildren(): - # ONLY FIRST CHILD - tweet = tweet.getChildren()[0] - apiJson[tweet.getId()] = tweet.asApiDict({'story': story}) - else: - apiJson[tweet.getId()] = tweet.asApiDict() - - return apiJson - - def rename_ids(self, data): - # Create a dictionary to store the count of each story ID - story_counts = {} - - # Create a dictionary to store the length of each story - story_lengths = {} - - # Calculate the length of each story before the loop - for tweet_id, values in data.items(): - # Check if "story" key is present, otherwise use tweet_id as the story ID - story_id = values.get("story", tweet_id) - - # Update the length of the story - story_lengths[story_id] = story_lengths.get(story_id, 0) + 1 - - # Iterate through the original dictionary and create a new one with modified keys - result = {} - for tweet_id, values in data.items(): - # Check if "story" key is present, otherwise use tweet_id as the story ID - story_id = values.get("story", tweet_id) - - # Calculate the count as the difference between the length of the story and the current length - count = story_lengths[story_id] - story_lengths[story_id] = story_lengths[story_id] - 1 - - # Create the new key in the format "story_id.count_tweet_id" - new_key = f"{story_id}.{count}_{tweet_id}" - result[new_key] = values - - return result - - def revert_ids(self, data): - # Iterate through the dictionary and create a new one with reverted keys - result = {} - for key, values in data.items(): - # Split the key into story_id.count_tweet_id - parts = key.split('_') - - # Extract the tweet_id from the second part - tweet_id = parts[1] - - # Create the new key with the original tweet_id - new_key = tweet_id - result[new_key] = values - - return result - - def saveApiJson(self, file = tweetsFetchSettings['file']): - data = self.asApiJson() - renamed_data = self.rename_ids(data) - # Sort tweets based on id in descending order - sorted_tweets = OrderedDict(sorted(renamed_data.items(), key=lambda x: x[0], reverse=True)) - final_tweets = self.revert_ids(sorted_tweets) - - writeTweetsApiJson(final_tweets, file) diff --git a/lib/mastodon_auth.py b/lib/mastodon_auth.py new file mode 100644 index 0000000..db7e41f --- /dev/null +++ b/lib/mastodon_auth.py @@ -0,0 +1,15 @@ +from mastodon import Mastodon +from .shared import base_folder, mastodonAuth +import json +import os + +with open(os.path.join(base_folder, 'config.json'), 'r') as f: + config = json.load(f) + +def get_auth_user(): + return Mastodon( + client_id = config['mastodon']['client']['key'], + client_secret = config['mastodon']['client']['secret'], + access_token = config['mastodon']['access']['token'], + api_base_url = config['mastodon']['instance_url'] + ) diff --git a/lib/mastodon_base.py b/lib/mastodon_base.py new file mode 100644 index 0000000..05aafd9 --- /dev/null +++ b/lib/mastodon_base.py @@ -0,0 +1,55 @@ +import os +import logging +import json +from datetime import datetime + +from .shared import toots_folder, data_folder, tootsFetchSettings +from .mastodon_auth import get_auth_user + +class DateTimeEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, datetime): + return obj.strftime('%Y-%m-%d %H:%M:%S') + return super().default(obj) + + +def readTootFromFolder(id, folder = tootsFetchSettings['folder']): + ff = os.path.join(toots_folder, folder) + with open(os.path.join(ff, '{}.json'.format(id)), 'r') as f: + return json.load(f) + +def writeTootToFolder(toot, folder = tootsFetchSettings['folder']): + ff = os.path.join(toots_folder, folder) + with open(os.path.join(ff, '{}.json'.format(toot['id'])), 'w') as f: + json.dump(toot, f, cls=DateTimeEncoder) + +def writeTootToArchive(toot, folder = tootsFetchSettings['folder']): + ff = os.path.join(toots_folder, folder, "archive") + with open(os.path.join(ff, '{}.json'.format(toot['id'])), 'w') as f: + json.dump(toot, f, cls=DateTimeEncoder) + +def deleteTootFromFolder(id, folder = tootsFetchSettings['folder']): + ff = os.path.join(toots_folder, folder) + os.remove(os.path.join(ff, '{}.json'.format(id))) + +def readTootsFromFolder(folder = tootsFetchSettings['folder']): + ff = os.path.join(toots_folder, folder) + toots = [readTootFromFolder(f[:-5], folder) for f in os.listdir(ff) if f[-5:] == '.json'] + logging.info('Read {} toots from folder: \'{}\''.format(len(toots), folder)) + return toots + + +def readTootFromMastodon(id): + mastodon = get_auth_user() + logging.info('Reading toot info from Mastodon API: {}'.format(id)) + return mastodon.status(id) + +def writeTootsApiJson(data, tootsApiFile = tootsFetchSettings['file']): + filePath = os.path.join(data_folder, tootsApiFile) + logging.info('Writing new toots API file: \'{}\''.format(tootsApiFile)) + with open(filePath, 'w') as f: + json.dump(data, f) + +def readTootsApiJson(tootsApiFile = tootsFetchSettings['file']): + with open(os.path.join(data_folder, tootsApiFile), 'r') as f: + return json.load(f) diff --git a/lib/shared.py b/lib/shared.py index d02db0c..f26947a 100644 --- a/lib/shared.py +++ b/lib/shared.py @@ -5,13 +5,13 @@ base_folder = os.path.join(lib_folder, '..') data_folder = os.path.join(base_folder, 'data') -tweets_folder = os.path.join(data_folder, 'tweets') +toots_folder = os.path.join(data_folder, 'toots') with open(os.path.join(base_folder, 'config.json'), 'r') as f: config = json.load(f) -tweetsFetchSettings = config['tweetsFetchSettings']['list'][config['tweetsFetchSettings']['default']] +tootsFetchSettings = config['tootsFetchSettings']['list'][config['tootsFetchSettings']['default']] authToken = config['authToken'] secretKey = config['secretKey'] -twitterAuth = config['twitter'] +mastodonAuth = config['mastodon'] diff --git a/lib/tweets_aux.py b/lib/toots_aux.py similarity index 70% rename from lib/tweets_aux.py rename to lib/toots_aux.py index e128f13..dd4e27c 100644 --- a/lib/tweets_aux.py +++ b/lib/toots_aux.py @@ -1,12 +1,11 @@ -import json import os import logging import GetOldTweets3 as got -from .shared import tweets_folder, tweetsFetchSettings -from .tweets_base import * +from .shared import toots_folder, tootsFetchSettings +from .mastodon_base import * -def getOldTweetIds(searchString = tweetsFetchSettings['link'], max = 100): +def getOldTweetIds(searchString = tootsFetchSettings['link'], max = 100): tweetCriteria = got.manager.TweetCriteria().setQuerySearch(searchString)\ .setMaxTweets(max) logging.info('Loading old tweet ids from twitter (GoT):') @@ -21,8 +20,8 @@ def getOldTweetIds(searchString = tweetsFetchSettings['link'], max = 100): return [t.id for t in tweets] -def populateTweetsFolder(folder = tweetsFetchSettings['folder'], ids = [], init = False, refresh = False): - d = os.path.join(tweets_folder, folder) +def populateTweetsFolder(folder = tootsFetchSettings['folder'], ids = [], init = False, refresh = False): + d = os.path.join(toots_folder, folder) if not os.path.exists(d): os.mkdir(d) ids_e = [f[:-5] for f in os.listdir(d)] @@ -37,5 +36,5 @@ def populateTweetsFolder(folder = tweetsFetchSettings['folder'], ids = [], init ids_f = list(set(ids) - set(ids_e)) for id in ids_f: - info = readTweetFromTwitter(id) - writeTweetToFolder(info, folder) + info = readTootFromMastodon(id) + writeTootToFolder(info, folder) diff --git a/lib/tweets_base.py b/lib/tweets_base.py deleted file mode 100644 index da29c50..0000000 --- a/lib/tweets_base.py +++ /dev/null @@ -1,43 +0,0 @@ -import os -import logging -import json - -from .shared import tweets_folder, data_folder, tweetsFetchSettings -from .twitter_auth import get_api_app - -def readTweetFromFolder(id, folder = tweetsFetchSettings['folder']): - ff = os.path.join(tweets_folder, folder) - with open(os.path.join(ff, '{}.json'.format(id)), 'r') as f: - return json.load(f) - -def writeTweetToFolder(tweet, folder = tweetsFetchSettings['folder']): - ff = os.path.join(tweets_folder, folder) - with open(os.path.join(ff, '{}.json'.format(tweet['id_str'])), 'w') as f: - json.dump(tweet, f) - -def deleteTweetFromFolder(id, folder = tweetsFetchSettings['folder']): - ff = os.path.join(tweets_folder, folder) - os.remove(os.path.join(ff, '{}.json'.format(id))) - -def readTweetsFromFolder(folder = tweetsFetchSettings['folder']): - ff = os.path.join(tweets_folder, folder) - tweets = [readTweetFromFolder(f[:-5], folder) for f in os.listdir(ff) if f[-5:] == '.json'] - logging.info('Read {} tweets from folder: \'{}\''.format(len(tweets), folder)) - return tweets - - -def readTweetFromTwitter(id): - api = get_api_app() - logging.info('Reading tweet info from twitter API: {}'.format(id)) - return api.get_status(id, include_entities=True, tweet_mode='extended')._json - - -def writeTweetsApiJson(data, tweetsApiFile = tweetsFetchSettings['file']): - filePath = os.path.join(data_folder, tweetsApiFile) - logging.info('Writing new tweets API file: \'{}\''.format(tweetsApiFile)) - with open(filePath, 'w') as f: - json.dump(data, f) - -def readTweetsApiJson(tweetsApiFile = tweetsFetchSettings['file']): - with open(os.path.join(data_folder, tweetsApiFile), 'r') as f: - return json.load(f) diff --git a/lib/twitter_auth.py b/lib/twitter_auth.py deleted file mode 100644 index c6f965b..0000000 --- a/lib/twitter_auth.py +++ /dev/null @@ -1,18 +0,0 @@ -from tweepy import AppAuthHandler, OAuthHandler, API -from .shared import base_folder, twitterAuth -import json -import os - - -def get_auth_app(): - ad = twitterAuth['api'] - return AppAuthHandler(ad['key'], ad['secret']) - -def get_api_app(): - return API(get_auth_app()) - -def get_auth_user(): - ad = twitterAuth - auth = OAuthHandler(ad['api']['key'], ad['api']['secret']) - auth.set_access_token(ad['access']['token'], ad['access']['token-secret']) - return auth diff --git a/runInit.py b/runInit.py index c1ca65b..2f7af00 100755 --- a/runInit.py +++ b/runInit.py @@ -1,11 +1,10 @@ #!./venv/bin/python -import json import os import logging import GetOldTweets3 as got from lib.shared import base_folder, tweetsFetchSettings -from lib.tweets_aux import getOldTweetIds, populateTweetsFolder +from lib.toots_aux import getOldTweetIds, populateTweetsFolder logging.basicConfig( level=logging.INFO, diff --git a/runListener.py b/runListener.py index aa064fa..e546361 100755 --- a/runListener.py +++ b/runListener.py @@ -1,14 +1,25 @@ -#!./venv/bin/python -import json -import tweepy +##!./venv/bin/python + import os import logging +import time +from mastodon import StreamListener + +from lib.shared import base_folder, tootsFetchSettings, toots_folder +from lib.mastodon_auth import get_auth_user +from lib.mastodon_base import writeTootToFolder, writeTootToArchive, readTootFromFolder, deleteTootFromFolder +from lib.TootForest import TootForest + + +mastodon = get_auth_user() -from lib.shared import base_folder, tweetsFetchSettings -from lib.twitter_auth import get_auth_user -from lib.tweets_base import writeTweetToFolder -from lib.TweetForest import TweetForest +keyword = tootsFetchSettings['listen'] +dir_path = os.path.join(toots_folder, tootsFetchSettings['folder']) +# Get all files in directory +files = os.listdir(dir_path) +# Extract ids (filenames without .json) +existing_ids = [os.path.splitext(file)[0] for file in files] logging.basicConfig( level=logging.INFO, @@ -19,36 +30,73 @@ ] ) -searchString = tweetsFetchSettings['listen'] +searchString = tootsFetchSettings['listen'] def update(): - forest = TweetForest.fromFolder(tweetsFetchSettings['folder']) - forest.saveApiJson(tweetsFetchSettings['file']) + forest = TootForest.fromFolder(tootsFetchSettings['folder']) + forest.saveApiJson(tootsFetchSettings['file']) + +class DecarbnowStreamListener(StreamListener): + def __init__(self, mastodon): + self.mastodon = mastodon + + def stream_with_reconnection(self): + retry_delay = 5 + max_retries = 10 + retry_count = 0 + + while retry_count < max_retries: + try: + self.mastodon.stream_public(self) + break + except Exception as e: + print("Error: ", e) + print("Versuche, die Verbindung nach 10 Sekunden erneut herzustellen...") + retry_count += 1 + time.sleep(retry_delay) -class DecarbnowStreamListener(tweepy.StreamListener): - def on_status(self, tweet): - id = tweet._json['id'] - logging.info('Listener got new tweet: {}'.format(id)) - writeTweetToFolder(tweet._json, tweetsFetchSettings['folder']) - update() + def on_update(self, toot): + if keyword in toot['content']: + id = toot['id'] + logging.info('Listener got new toot: {}'.format(id)) + writeTootToFolder(toot, tootsFetchSettings['folder']) + update() - def on_error(self, status_code): + def on_status_update(self, toot): + if keyword in toot['content']: + id = toot['id'] + logging.info('Listener got new toot: {}'.format(id)) + writeTootToFolder(toot, tootsFetchSettings['folder']) + update() + + def on_delete(self, status_id): + str_status = str(status_id) + if str_status in existing_ids: + logging.info('Archiving toot (id: {})!'.format(str_status)) + toot = readTootFromFolder(str_status) + archive_dir = os.path.join(toots_folder, tootsFetchSettings['folder'], 'archive') + os.makedirs(archive_dir, exist_ok=True) + + writeTootToArchive(toot, tootsFetchSettings['folder']) + deleteTootFromFolder(status_id, tootsFetchSettings['folder']) + + update() + + def handle_heartbeat(self): + print("Received a heartbeat from the server!") + + def on_abort(self, status_code): logging.warning('Listener got error, status code: {}'.format(status_code)) return True - logging.info('Init Listener:') -logging.info(' - Tweets Folder: \'{}\''.format(tweetsFetchSettings['folder'])) +logging.info(' - Toots Folder: \'{}\''.format(tootsFetchSettings['folder'])) logging.info(' - Search String: \'{}\''.format(searchString)) -listener = DecarbnowStreamListener() -stream = tweepy.Stream(auth = get_auth_user(), listener = listener, tweet_mode = 'extended') +listener = DecarbnowStreamListener(mastodon) -logging.info('Init Tweets API File ...') +logging.info('Init Toots API File ...') update() logging.info('Start Listener ...') -try: - stream.filter(track=[searchString]) -except KeyboardInterrupt: - pass +listener.stream_with_reconnection() \ No newline at end of file From 65671e86466147d51381bd4f742b678d5ae10f53 Mon Sep 17 00:00:00 2001 From: sweing Date: Tue, 9 Jan 2024 20:20:50 +0100 Subject: [PATCH 02/12] only heardbeat on connect --- runListener.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/runListener.py b/runListener.py index e546361..6a2fb67 100755 --- a/runListener.py +++ b/runListener.py @@ -39,6 +39,7 @@ def update(): class DecarbnowStreamListener(StreamListener): def __init__(self, mastodon): self.mastodon = mastodon + self.receivedHeartbeat = False def stream_with_reconnection(self): retry_delay = 5 @@ -50,6 +51,7 @@ def stream_with_reconnection(self): self.mastodon.stream_public(self) break except Exception as e: + self.receivedHeartbeat = False print("Error: ", e) print("Versuche, die Verbindung nach 10 Sekunden erneut herzustellen...") retry_count += 1 @@ -83,7 +85,9 @@ def on_delete(self, status_id): update() def handle_heartbeat(self): - print("Received a heartbeat from the server!") + if not self.receivedHeartbeat: + print("Received a heartbeat from the server!") + self.receivedHeartbeat = True def on_abort(self, status_code): logging.warning('Listener got error, status code: {}'.format(status_code)) From bf0d75c6c112444fdd6dca84edf73613b215fb68 Mon Sep 17 00:00:00 2001 From: sweing Date: Tue, 9 Jan 2024 20:38:28 +0100 Subject: [PATCH 03/12] more beautiful messaging --- runListener.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runListener.py b/runListener.py index 6a2fb67..40caf29 100755 --- a/runListener.py +++ b/runListener.py @@ -86,7 +86,7 @@ def on_delete(self, status_id): def handle_heartbeat(self): if not self.receivedHeartbeat: - print("Received a heartbeat from the server!") + logging.info("Connected to server. Listening.") self.receivedHeartbeat = True def on_abort(self, status_code): From 74644560c5653b6d1a4372e30fea8505c5a58ea4 Mon Sep 17 00:00:00 2001 From: sweing Date: Tue, 9 Jan 2024 20:40:01 +0100 Subject: [PATCH 04/12] more beautiful messaging2 --- runListener.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runListener.py b/runListener.py index 40caf29..e6f1f0f 100755 --- a/runListener.py +++ b/runListener.py @@ -42,7 +42,7 @@ def __init__(self, mastodon): self.receivedHeartbeat = False def stream_with_reconnection(self): - retry_delay = 5 + retry_delay = 10 max_retries = 10 retry_count = 0 @@ -52,8 +52,8 @@ def stream_with_reconnection(self): break except Exception as e: self.receivedHeartbeat = False - print("Error: ", e) - print("Versuche, die Verbindung nach 10 Sekunden erneut herzustellen...") + logging.warning("Error: ", e) + logging.info("Versuche, die Verbindung nach 10 Sekunden erneut herzustellen...") retry_count += 1 time.sleep(retry_delay) From a1c53f860405cd51815619ffc2ceb9812e015d93 Mon Sep 17 00:00:00 2001 From: sweing Date: Wed, 10 Jan 2024 10:04:50 +0100 Subject: [PATCH 05/12] messaging --- runListener.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/runListener.py b/runListener.py index e6f1f0f..29829ba 100755 --- a/runListener.py +++ b/runListener.py @@ -36,7 +36,7 @@ def update(): forest = TootForest.fromFolder(tootsFetchSettings['folder']) forest.saveApiJson(tootsFetchSettings['file']) -class DecarbnowStreamListener(StreamListener): +class MastodonStreamListener(StreamListener): def __init__(self, mastodon): self.mastodon = mastodon self.receivedHeartbeat = False @@ -53,7 +53,7 @@ def stream_with_reconnection(self): except Exception as e: self.receivedHeartbeat = False logging.warning("Error: ", e) - logging.info("Versuche, die Verbindung nach 10 Sekunden erneut herzustellen...") + logging.info("Trying to reconnect after " + retry_delay + "seconds.") retry_count += 1 time.sleep(retry_delay) @@ -88,6 +88,7 @@ def handle_heartbeat(self): if not self.receivedHeartbeat: logging.info("Connected to server. Listening.") self.receivedHeartbeat = True + retry_count = 0 def on_abort(self, status_code): logging.warning('Listener got error, status code: {}'.format(status_code)) @@ -97,7 +98,7 @@ def on_abort(self, status_code): logging.info(' - Toots Folder: \'{}\''.format(tootsFetchSettings['folder'])) logging.info(' - Search String: \'{}\''.format(searchString)) -listener = DecarbnowStreamListener(mastodon) +listener = MastodonStreamListener(mastodon) logging.info('Init Toots API File ...') update() From 062957e97294ba777b51c6fac1f7ccf1148d3d37 Mon Sep 17 00:00:00 2001 From: sweing Date: Wed, 10 Jan 2024 13:56:22 +0100 Subject: [PATCH 06/12] reconnect on_abord --- runListener.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runListener.py b/runListener.py index 29829ba..547f60e 100755 --- a/runListener.py +++ b/runListener.py @@ -53,7 +53,7 @@ def stream_with_reconnection(self): except Exception as e: self.receivedHeartbeat = False logging.warning("Error: ", e) - logging.info("Trying to reconnect after " + retry_delay + "seconds.") + logging.info("Trying to reconnect after " + str(retry_delay) + "seconds.") retry_count += 1 time.sleep(retry_delay) @@ -92,7 +92,7 @@ def handle_heartbeat(self): def on_abort(self, status_code): logging.warning('Listener got error, status code: {}'.format(status_code)) - return True + self.stream_with_reconnection() logging.info('Init Listener:') logging.info(' - Toots Folder: \'{}\''.format(tootsFetchSettings['folder'])) From 64c38a7108f2be11a7e567acd37e27306783f4db Mon Sep 17 00:00:00 2001 From: sweing Date: Wed, 10 Jan 2024 18:42:12 +0100 Subject: [PATCH 07/12] combine tweets & toots --- api/modules/toots.py | 59 ++++++++++++++++++++++++++++++++++++++++++-- lib/Toot.py | 1 + runListener.py | 4 +-- 3 files changed, 60 insertions(+), 4 deletions(-) diff --git a/api/modules/toots.py b/api/modules/toots.py index 920d4e3..3643bd7 100644 --- a/api/modules/toots.py +++ b/api/modules/toots.py @@ -16,9 +16,36 @@ @cross_origin() @login_exempt def root(): + # Read the current tweets from the API or other source + current_tweets = readTootsApiJson() + + # Read additional tweets from file_tweets.json + try: + with open('data/file_tweets.json', 'r') as file: + file_tweets_data = json.load(file) + # Check if the data is a dictionary + if isinstance(file_tweets_data, dict): + # If the current tweets are also a dictionary, update it + if isinstance(current_tweets, dict): + current_tweets.update(file_tweets_data) + # If the current tweets are a list, convert the dictionary to a list and extend it + elif isinstance(current_tweets, list): + current_tweets.extend(file_tweets_data.values()) + else: + raise ValueError("The current tweets are neither a list nor a dictionary.") + else: + raise ValueError("The contents of file_tweets.json are not a dictionary") + except FileNotFoundError: + logging.error("The file file_tweets.json was not found.") + except json.JSONDecodeError: + logging.error("The file file_tweets.json does not contain valid JSON.") + except ValueError as e: + logging.error(e) + + # Return the combined data as a JSON response return Response(json.dumps({ 'date': int(datetime.timestamp(datetime.now())), - 'tweets': readTootsApiJson() + 'tweets': current_tweets }), mimetype='application/json') @@ -37,7 +64,35 @@ def forest_create(): logging.warning('Manual invocation of creating forest!') forest = TootForest.fromFolder() forest.saveApiJson() - return Response(json.dumps(readTootsApiJson()), mimetype='application/json') + + # Read the current tweets from the API or other source + current_tweets = readTootsApiJson() + + # Read additional tweets from file_tweets.json + try: + with open('data/file_tweets.json', 'r') as file: + file_tweets_data = json.load(file) + # Check if the data is a dictionary + if isinstance(file_tweets_data, dict): + # If the current tweets are also a dictionary, update it + if isinstance(current_tweets, dict): + current_tweets.update(file_tweets_data) + # If the current tweets are a list, convert the dictionary to a list and extend it + elif isinstance(current_tweets, list): + current_tweets.extend(file_tweets_data.values()) + else: + raise ValueError("The current tweets are neither a list nor a dictionary.") + else: + raise ValueError("The contents of file_tweets.json are not a dictionary") + except FileNotFoundError: + logging.error("The file file_tweets.json was not found.") + except json.JSONDecodeError: + logging.error("The file file_tweets.json does not contain valid JSON.") + except ValueError as e: + logging.error(e) + + # Return the combined data as a JSON response + return Response(json.dumps(current_tweets), mimetype='application/json') @bp.route('/add/') def add(id): diff --git a/lib/Toot.py b/lib/Toot.py index 987d7d6..3d61439 100644 --- a/lib/Toot.py +++ b/lib/Toot.py @@ -59,6 +59,7 @@ def asApiDict(self, add = {}): 'display_name': self.getUserScreenName(), 'avatar': self.getUserImageHttps(), 'media': self.getMedia(), + 'source': 'mastodon.social', **add } diff --git a/runListener.py b/runListener.py index 547f60e..19f7259 100755 --- a/runListener.py +++ b/runListener.py @@ -49,11 +49,12 @@ def stream_with_reconnection(self): while retry_count < max_retries: try: self.mastodon.stream_public(self) + retry_count = 0 break except Exception as e: self.receivedHeartbeat = False logging.warning("Error: ", e) - logging.info("Trying to reconnect after " + str(retry_delay) + "seconds.") + logging.info("Trying to reconnect after " + str(retry_delay) + " seconds.") retry_count += 1 time.sleep(retry_delay) @@ -88,7 +89,6 @@ def handle_heartbeat(self): if not self.receivedHeartbeat: logging.info("Connected to server. Listening.") self.receivedHeartbeat = True - retry_count = 0 def on_abort(self, status_code): logging.warning('Listener got error, status code: {}'.format(status_code)) From dcbaaeb2409b87f7f29b9da3859ed6fb550b9116 Mon Sep 17 00:00:00 2001 From: sweing Date: Sun, 14 Jan 2024 16:54:02 +0100 Subject: [PATCH 08/12] update() now updates the existing_ids array --- requirements.txt | 3 +++ runListener.py | 13 ++++++++----- sync.sh | 2 ++ 3 files changed, 13 insertions(+), 5 deletions(-) create mode 100755 sync.sh diff --git a/requirements.txt b/requirements.txt index 0054864..52cc1d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ backcall==0.1.0 +blurhash==1.1.4 certifi==2020.4.5.1 chardet==3.0.4 click==7.1.2 @@ -18,6 +19,7 @@ jupyter-client==6.1.3 jupyter-core==4.6.3 lxml==4.5.1 MarkupSafe==1.1.1 +Mastodon.py==1.8.1 oauthlib==3.1.0 parso==0.7.0 pexpect==4.8.0 @@ -28,6 +30,7 @@ Pygments==2.6.1 pyquery==1.4.1 PySocks==1.7.1 python-dateutil==2.8.1 +python-magic==0.4.27 pyzmq==19.0.1 requests==2.23.0 requests-oauthlib==1.3.0 diff --git a/runListener.py b/runListener.py index 19f7259..50ae4bd 100755 --- a/runListener.py +++ b/runListener.py @@ -16,9 +16,7 @@ keyword = tootsFetchSettings['listen'] dir_path = os.path.join(toots_folder, tootsFetchSettings['folder']) -# Get all files in directory files = os.listdir(dir_path) -# Extract ids (filenames without .json) existing_ids = [os.path.splitext(file)[0] for file in files] logging.basicConfig( @@ -33,8 +31,14 @@ searchString = tootsFetchSettings['listen'] def update(): + global existing_ids forest = TootForest.fromFolder(tootsFetchSettings['folder']) forest.saveApiJson(tootsFetchSettings['file']) + + # Update the global existing_ids variable + files = os.listdir(dir_path) + existing_ids = [os.path.splitext(file)[0] for file in files] + class MastodonStreamListener(StreamListener): def __init__(self, mastodon): @@ -60,7 +64,7 @@ def stream_with_reconnection(self): def on_update(self, toot): if keyword in toot['content']: - id = toot['id'] + id = str(toot['id']) # Convert the ID to a string, if it's not already logging.info('Listener got new toot: {}'.format(id)) writeTootToFolder(toot, tootsFetchSettings['folder']) update() @@ -68,7 +72,7 @@ def on_update(self, toot): def on_status_update(self, toot): if keyword in toot['content']: id = toot['id'] - logging.info('Listener got new toot: {}'.format(id)) + logging.info('Listener got update of toot: {}'.format(id)) writeTootToFolder(toot, tootsFetchSettings['folder']) update() @@ -82,7 +86,6 @@ def on_delete(self, status_id): writeTootToArchive(toot, tootsFetchSettings['folder']) deleteTootFromFolder(status_id, tootsFetchSettings['folder']) - update() def handle_heartbeat(self): diff --git a/sync.sh b/sync.sh new file mode 100755 index 0000000..cf3bb0d --- /dev/null +++ b/sync.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +rsync -a --exclude='/data' . nebula@spica.uberspace.de:~/html/dev2.decarbnow.space From a177ef576ec697c574b9df34552e2b03086bdb2c Mon Sep 17 00:00:00 2001 From: sweing Date: Sat, 27 Sep 2025 12:21:55 +0200 Subject: [PATCH 09/12] implement bluesky --- api/modules/toots.py | 22 ++-- lib/Toot.py | 99 ++++++++++++++-- lib/TootForest.py | 3 +- lib/bluesky_base.py | 164 +++++++++++++++++++++++++ lib/bluesky_did_resolver.py | 124 +++++++++++++++++++ requirements.txt | 98 +++++++++------ runListener.py | 230 ++++++++++++++++++++++++++++++++++-- 7 files changed, 669 insertions(+), 71 deletions(-) create mode 100644 lib/bluesky_base.py create mode 100644 lib/bluesky_did_resolver.py diff --git a/api/modules/toots.py b/api/modules/toots.py index 3643bd7..17c897b 100644 --- a/api/modules/toots.py +++ b/api/modules/toots.py @@ -1,6 +1,7 @@ from datetime import datetime import logging import json +import os from flask import Blueprint, jsonify, Response, render_template, abort from flask_cors import cross_origin @@ -9,6 +10,7 @@ from lib.mastodon_base import readTootsApiJson from lib.Toot import Toot from lib.TootForest import TootForest +from lib.shared import data_folder bp = Blueprint('tweets', __name__, url_prefix='/tweets') @@ -19,9 +21,9 @@ def root(): # Read the current tweets from the API or other source current_tweets = readTootsApiJson() - # Read additional tweets from file_tweets.json + # Read additional tweets from file_tweets_transformed.json try: - with open('data/file_tweets.json', 'r') as file: + with open(os.path.join(data_folder, 'file_tweets_transformed.json'), 'r', encoding='utf-8') as file: file_tweets_data = json.load(file) # Check if the data is a dictionary if isinstance(file_tweets_data, dict): @@ -34,11 +36,11 @@ def root(): else: raise ValueError("The current tweets are neither a list nor a dictionary.") else: - raise ValueError("The contents of file_tweets.json are not a dictionary") + raise ValueError("The contents of file_tweets_transformed.json are not a dictionary") except FileNotFoundError: - logging.error("The file file_tweets.json was not found.") + logging.error("The file file_tweets_transformed.json was not found.") except json.JSONDecodeError: - logging.error("The file file_tweets.json does not contain valid JSON.") + logging.error("The file file_tweets_transformed.json does not contain valid JSON.") except ValueError as e: logging.error(e) @@ -68,9 +70,9 @@ def forest_create(): # Read the current tweets from the API or other source current_tweets = readTootsApiJson() - # Read additional tweets from file_tweets.json + # Read additional tweets from file_tweets_transformed.json try: - with open('data/file_tweets.json', 'r') as file: + with open(os.path.join(data_folder, 'file_tweets_transformed.json'), 'r', encoding='utf-8') as file: file_tweets_data = json.load(file) # Check if the data is a dictionary if isinstance(file_tweets_data, dict): @@ -83,11 +85,11 @@ def forest_create(): else: raise ValueError("The current tweets are neither a list nor a dictionary.") else: - raise ValueError("The contents of file_tweets.json are not a dictionary") + raise ValueError("The contents of file_tweets_transformed.json are not a dictionary") except FileNotFoundError: - logging.error("The file file_tweets.json was not found.") + logging.error("The file file_tweets_transformed.json was not found.") except json.JSONDecodeError: - logging.error("The file file_tweets.json does not contain valid JSON.") + logging.error("The file file_tweets_transformed.json does not contain valid JSON.") except ValueError as e: logging.error(e) diff --git a/lib/Toot.py b/lib/Toot.py index 3d61439..d5f21ba 100644 --- a/lib/Toot.py +++ b/lib/Toot.py @@ -50,6 +50,12 @@ def hasChildren(self): return len(self.children) > 0 def asApiDict(self, add = {}): + # Determine source platform + source = 'mastodon.social' # Default + if 'platform' in self.data: + if self.data['platform'] == 'bluesky': + source = 'bluesky' + return { 'url': self.getPathOfLinksTo()[0], 'hashtags': self.getHashtags(), @@ -59,7 +65,7 @@ def asApiDict(self, add = {}): 'display_name': self.getUserScreenName(), 'avatar': self.getUserImageHttps(), 'media': self.getMedia(), - 'source': 'mastodon.social', + 'source': source, **add } @@ -81,7 +87,7 @@ def getId(self): return self.data['id'] def getUserId(self): - return self.data['account']['id'] + return self.data['account'].get('id', self.data['account'].get('acct', '')) def getUserName(self): return self.data['account']['username'] @@ -90,15 +96,39 @@ def getUserScreenName(self): return self.data['account']['display_name'] def getUserImageHttps(self): - return self.data['account']['avatar_static'] + return self.data['account'].get('avatar_static', '') def getDateTime(self): - return datetime.strptime(self.data['created_at'], '%Y-%m-%d %H:%M:%S') + date_str = self.data['created_at'] + # Handle different date formats (Mastodon vs Bluesky) + try: + return datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S') + except ValueError: + try: + # ISO format with microseconds (Bluesky) + return datetime.fromisoformat(date_str.replace('Z', '+00:00')) + except ValueError: + # Fallback - return current time if parsing fails + return datetime.now() def getMedia(self): + media = [] + # Handle Mastodon-style media if 'media_attachments' in self.data: - return [item['preview_url'] for item in self.data['media_attachments'] if 'preview_url' in item] - return [] + media.extend([item['preview_url'] for item in self.data['media_attachments'] if 'preview_url' in item]) + # Handle Bluesky media from embed + if 'platform' in self.data and self.data['platform'] == 'bluesky' and 'raw_record' in self.data: + raw_record = self.data['raw_record'] + if 'embed' in raw_record: + # Handle image embeds + if raw_record['embed'].get('$type') == 'app.bsky.embed.images' and 'images' in raw_record['embed']: + for image in raw_record['embed']['images']: + if 'image' in image and '$link' in image['image']['ref']: + # Convert blob reference to CDN URL + blob_ref = image['image']['ref']['$link'] + cdn_url = f"https://cdn.bsky.app/img/feed_thumbnail/plain/did:plc:adsquh2z4vzbpeelyvkq4rbl/{blob_ref}@jpeg" + media.append(cdn_url) + return media def getText(self): if 'content' in self.data: @@ -125,10 +155,21 @@ def isSelfReply(self): def getHashtags(self): hashtags = [] - if self.data['tags']: + # Handle Mastodon-style tags + if 'tags' in self.data and self.data['tags']: hashtags.extend([h['name'] for h in self.data['tags']]) - if 'extended_toot' in self.data and self.data['extended_toot']['entities']['hashtags']: + # Handle extended_toot format + if 'extended_toot' in self.data and 'entities' in self.data['extended_toot'] and 'hashtags' in self.data['extended_toot']['entities']: hashtags.extend([h['text'] for h in self.data['extended_toot']['entities']['hashtags']]) + # Handle Bluesky hashtags from facets + if 'platform' in self.data and self.data['platform'] == 'bluesky' and 'raw_record' in self.data: + raw_record = self.data['raw_record'] + if 'facets' in raw_record: + for facet in raw_record['facets']: + if 'features' in facet: + for feature in facet['features']: + if feature.get('$type') == 'app.bsky.richtext.facet#tag' and 'tag' in feature: + hashtags.append(feature['tag']) return hashtags def hasHashtag(self, hashtag): @@ -136,14 +177,50 @@ def hasHashtag(self, hashtag): def getLinks(self): urls = [] - if 'content' in self.data: + content = self.getText() + + # For Bluesky posts - extract URLs from structured data first + if 'platform' in self.data and self.data['platform'] == 'bluesky' and 'raw_record' in self.data: + raw_record = self.data['raw_record'] + + # Extract from embed.external.uri + if 'embed' in raw_record and 'external' in raw_record['embed'] and 'uri' in raw_record['embed']['external']: + urls.append(raw_record['embed']['external']['uri']) + + # Extract from facets (rich text links) + if 'facets' in raw_record: + for facet in raw_record['facets']: + if 'features' in facet: + for feature in facet['features']: + if feature.get('$type') == 'app.bsky.richtext.facet#link' and 'uri' in feature: + urls.append(feature['uri']) + + # For Mastodon posts with HTML content + if 'content' in self.data and '<' in self.data['content']: extractor = URLExtractor() extractor.feed(self.data['content']) urls.extend(extractor.urls) - return urls + + # Fallback: extract URLs from plain text content using regex + if content: + import re + # Match URLs in plain text (http/https URLs) + url_pattern = r'https?://[^\s<>"]+|[a-zA-Z0-9][-a-zA-Z0-9]*\.(?:[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?)[^\s<>"]*' + text_urls = re.findall(url_pattern, content) + urls.extend(text_urls) + + return list(set(urls)) # Remove duplicates def getPathOfLinksTo(self, to = tootsFetchSettings['link']): - return [u.split(to, 1)[1] for u in self.getLinks() if to in u] + links = self.getLinks() + matching_links = [u for u in links if to in u] + + # For Bluesky posts, prioritize full URLs (those starting with https://) over truncated ones + if 'platform' in self.data and self.data['platform'] == 'bluesky': + # Sort so full URLs come first + matching_links.sort(key=lambda x: not x.startswith('https://')) + + return [u.split(to, 1)[1] for u in matching_links] def hasLinkTo(self, to = f"{tootsFetchSettings['link']}/@"): return len(self.getPathOfLinksTo(to)) > 0 diff --git a/lib/TootForest.py b/lib/TootForest.py index b70902a..e931643 100644 --- a/lib/TootForest.py +++ b/lib/TootForest.py @@ -9,7 +9,8 @@ class TootForest(object): @staticmethod def fromFolder(folder = tootsFetchSettings['folder']): toots = Toot.loadFromFolder(folder) - toots = filter(lambda t: t.hasLinkTo(), toots) + # Filter for posts that contain links to the configured domain + toots = filter(lambda t: t.hasLinkTo(tootsFetchSettings['link']), toots) # toots = filter(lambda t: not t.hasHashtag('private'), toots) return TootForest(toots) diff --git a/lib/bluesky_base.py b/lib/bluesky_base.py new file mode 100644 index 0000000..46a2291 --- /dev/null +++ b/lib/bluesky_base.py @@ -0,0 +1,164 @@ +import json +import logging +import time +import threading +from urllib.parse import urlencode +import websocket +from datetime import datetime +from .bluesky_did_resolver import did_resolver + +class BlueSkyStreamListener: + def __init__(self, keywords, callback_func, reconnect_attempts=10, reconnect_delay=5): + self.endpoint = 'wss://jetstream2.us-west.bsky.network/subscribe' + self.wanted_collections = ['app.bsky.feed.post'] + self.keywords = keywords + self.callback_func = callback_func + self.ws = None + self.reconnect_attempts = 0 + self.max_reconnect_attempts = reconnect_attempts + self.reconnect_delay = reconnect_delay + self.should_run = True + + def _build_url(self): + params = [('wantedCollections', collection) for collection in self.wanted_collections] + query_string = urlencode(params) + return f"{self.endpoint}?{query_string}" + + def on_message(self, ws, message): + try: + data = json.loads(message) + self._process_message(data) + except json.JSONDecodeError as e: + logging.error(f"Error parsing Bluesky message: {e}") + except Exception as e: + logging.error(f"Error processing Bluesky message: {e}") + + def _process_message(self, message): + if message.get('kind') == 'commit' and message.get('commit'): + commit = message['commit'] + operation = commit.get('operation', '') + + # Handle deletion events + if operation == 'delete': + self._handle_deletion(message) + return + + # Handle creation/update events (existing logic) + if commit.get('record') and operation == 'create': + record = commit['record'] + + if record.get('text'): + text = record['text'].lower() + matched_keywords = [keyword for keyword in self.keywords if keyword.lower() in text] + + if matched_keywords: + # Extract ID from the message - use rkey which is the record key + post_id = commit.get('rkey', '') + + # Construct the proper AT-URI + at_uri = f"at://{message.get('did', '')}/app.bsky.feed.post/{post_id}" + + # Resolve DID to actual handle + did = message.get('did', '') + resolved_handle = did_resolver.resolve_did_to_handle(did) + + # Extract reply information if this is a reply + in_reply_to_id = None + in_reply_to_account_id = None + if 'reply' in record and record['reply'].get('parent'): + parent_uri = record['reply']['parent'].get('uri', '') + if parent_uri.startswith('at://'): + # Parse AT-URI to extract DID and post ID + # Format: at://did:plc:xxxxx/app.bsky.feed.post/postid + parts = parent_uri.split('/') + if len(parts) >= 4: + parent_did = parts[2] # The DID part + in_reply_to_id = parts[-1] # The post ID part + in_reply_to_account_id = parent_did + + # Convert to a format similar to Mastodon toot + bluesky_post = { + 'id': post_id, + 'content': record['text'], + 'created_at': datetime.fromtimestamp(message.get('time_us', 0) / 1000000).isoformat(), + 'account': { + 'acct': did, + 'username': resolved_handle.replace('@', '') if resolved_handle.startswith('@') else resolved_handle, + 'display_name': resolved_handle + }, + 'uri': at_uri, + 'url': f"https://bsky.app/profile/{message.get('did', '')}/post/{post_id}", + 'platform': 'bluesky', + 'matched_keywords': matched_keywords, + 'raw_record': record + } + + # Add reply fields if this is a reply + if in_reply_to_id: + bluesky_post['in_reply_to_id'] = in_reply_to_id + bluesky_post['in_reply_to_account_id'] = in_reply_to_account_id + + self.callback_func(bluesky_post) + + def _handle_deletion(self, message): + """Handle Bluesky post deletion events - only for posts we've captured""" + commit = message.get('commit', {}) + post_id = commit.get('rkey', '') + + if post_id: + # Only process deletions for posts we actually have stored + # The deletion callback will check if the post exists locally + if hasattr(self, 'deletion_callback') and self.deletion_callback: + self.deletion_callback(post_id) + + def set_deletion_callback(self, callback_func): + """Set a callback function to handle deletions""" + self.deletion_callback = callback_func + + def on_error(self, ws, error): + logging.error(f"Bluesky WebSocket error: {error}") + + def on_close(self, ws, close_status_code, close_msg): + logging.warning("Bluesky connection closed") + if self.should_run: + self._reconnect() + + def on_open(self, ws): + logging.info("Connected to Bluesky Jetstream") + self.reconnect_attempts = 0 + + def _reconnect(self): + if self.reconnect_attempts < self.max_reconnect_attempts and self.should_run: + self.reconnect_attempts += 1 + logging.info(f"Reconnecting to Bluesky in {self.reconnect_delay}s (attempt {self.reconnect_attempts}/{self.max_reconnect_attempts})") + time.sleep(self.reconnect_delay) + self.start_stream() + else: + logging.error("Max Bluesky reconnection attempts reached") + + def start_stream(self): + if not self.should_run: + return + + try: + url = self._build_url() + logging.info(f"Connecting to Bluesky: {url}") + + self.ws = websocket.WebSocketApp( + url, + on_open=self.on_open, + on_message=self.on_message, + on_error=self.on_error, + on_close=self.on_close + ) + + self.ws.run_forever() + except Exception as e: + logging.error(f"Error starting Bluesky stream: {e}") + if self.should_run: + self._reconnect() + + def stop_stream(self): + self.should_run = False + if self.ws: + self.ws.close() \ No newline at end of file diff --git a/lib/bluesky_did_resolver.py b/lib/bluesky_did_resolver.py new file mode 100644 index 0000000..2d69211 --- /dev/null +++ b/lib/bluesky_did_resolver.py @@ -0,0 +1,124 @@ +import requests +import logging +import json +import time +from datetime import datetime, timedelta + +class BlueSkyDIDResolver: + def __init__(self): + self.cache = {} + self.cache_expiry = {} + self.cache_duration = timedelta(hours=24) # Cache for 24 hours + + def resolve_did_to_handle(self, did): + """ + Resolve a DID to a Bluesky handle using AT Protocol + + Args: + did (str): The DID to resolve (e.g., 'did:plc:adsquh2z4vzbpeelyvkq4rbl') + + Returns: + str: The resolved handle (e.g., '@username.bsky.social') or formatted DID if resolution fails + """ + # Check cache first + if self._is_cached(did): + return self.cache[did] + + try: + # Use AT Protocol's com.atproto.identity.resolveHandle API + # But we need to resolve DID to handle, so we use the directory service + + # Try multiple resolution methods + handle = self._resolve_via_plc_directory(did) + if not handle: + handle = self._resolve_via_bsky_api(did) + + if handle: + # Cache the result + self._cache_result(did, handle) + return handle + else: + # Fallback: format the DID nicely + fallback = self._format_did_fallback(did) + self._cache_result(did, fallback) + return fallback + + except Exception as e: + logging.warning(f"Failed to resolve DID {did}: {e}") + fallback = self._format_did_fallback(did) + self._cache_result(did, fallback) + return fallback + + def _resolve_via_plc_directory(self, did): + """Try to resolve via PLC directory""" + try: + # PLC directory API + url = f"https://plc.directory/{did}" + response = requests.get(url, timeout=5) + + if response.status_code == 200: + data = response.json() + # Look for the handle in the service endpoints + if 'alsoKnownAs' in data: + for aka in data['alsoKnownAs']: + if aka.startswith('at://'): + handle = aka.replace('at://', '') + return f"@{handle}" + + except Exception as e: + logging.debug(f"PLC directory resolution failed for {did}: {e}") + + return None + + def _resolve_via_bsky_api(self, did): + """Try to resolve via Bluesky's public API""" + try: + # Use Bluesky's public API to get profile + url = f"https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile" + params = {'actor': did} + response = requests.get(url, params=params, timeout=5) + + if response.status_code == 200: + data = response.json() + if 'handle' in data: + return f"@{data['handle']}" + + except Exception as e: + logging.debug(f"Bluesky API resolution failed for {did}: {e}") + + return None + + def _format_did_fallback(self, did): + """Format DID as a more readable fallback""" + if did.startswith('did:plc:'): + # Take first 8 characters of the identifier + identifier = did.split(':')[-1][:8] + return f"@{identifier}...bsky" + return did + + def _is_cached(self, did): + """Check if DID is in cache and not expired""" + if did not in self.cache: + return False + + if did in self.cache_expiry: + if datetime.now() > self.cache_expiry[did]: + # Cache expired, remove it + del self.cache[did] + del self.cache_expiry[did] + return False + + return True + + def _cache_result(self, did, handle): + """Cache the resolution result""" + self.cache[did] = handle + self.cache_expiry[did] = datetime.now() + self.cache_duration + + def clear_cache(self): + """Clear the entire cache""" + self.cache.clear() + self.cache_expiry.clear() + +# Global instance for reuse +did_resolver = BlueSkyDIDResolver() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 52cc1d1..f33961f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,43 +1,63 @@ -backcall==0.1.0 -blurhash==1.1.4 -certifi==2020.4.5.1 -chardet==3.0.4 -click==7.1.2 -cssselect==1.1.0 -decorator==4.4.2 +# Core Flask and web framework Flask==1.1.2 Flask-Cors==3.0.8 -GetOldTweets3==0.0.11 -idna==2.9 -ipykernel==5.2.1 -ipython==7.14.0 -ipython-genutils==0.2.0 -itsdangerous==1.1.0 -jedi==0.17.0 -Jinja2==2.11.2 -jupyter-client==6.1.3 -jupyter-core==4.6.3 -lxml==4.5.1 -MarkupSafe==1.1.1 +Werkzeug>=0.15 + +# Mastodon integration Mastodon.py==1.8.1 -oauthlib==3.1.0 -parso==0.7.0 -pexpect==4.8.0 -pickleshare==0.7.5 -prompt-toolkit==3.0.5 -ptyprocess==0.6.0 -Pygments==2.6.1 -pyquery==1.4.1 -PySocks==1.7.1 -python-dateutil==2.8.1 -python-magic==0.4.27 -pyzmq==19.0.1 -requests==2.23.0 -requests-oauthlib==1.3.0 -six==1.14.0 -tornado==6.0.4 -traitlets==4.3.3 +blurhash>=1.1.4 +python-magic>=0.4.27 + +# Twitter integration +GetOldTweets3==0.0.11 tweepy==3.8.0 -urllib3==1.25.9 -wcwidth==0.1.9 -Werkzeug==1.0.1 + +# Bluesky integration +websocket-client==1.6.4 + +# HTTP and networking +requests>=2.23.0 +requests-oauthlib>=0.7.0 +oauthlib>=3.0.0 +urllib3>=1.25.9 +certifi>=2017.4.17 +PySocks>=1.5.7 + +# Web scraping and parsing +lxml>=4.5.1 +pyquery>=1.2.10 +cssselect>=1.1.0 + +# Utilities +python-dateutil>=2.8.1 +click>=7.1.2 +six>=1.14.0 +chardet>=3.0.2 +idna>=2.5 +decorator>=4.4.2 + +# Template engine +Jinja2>=2.10.1 +MarkupSafe>=1.1.1 +itsdangerous>=0.24 + +# Optional: Development and notebook support (install only if needed) +# ipython>=7.14.0 +# ipykernel>=5.2.1 +# jupyter-client>=6.1.3 +# jupyter-core>=4.6.3 +# ipython-genutils>=0.2.0 +# prompt-toolkit>=3.0.5 +# Pygments>=2.6.1 +# backcall>=0.1.0 +# pexpect>=4.8.0 +# pickleshare>=0.7.5 +# ptyprocess>=0.6.0 +# parso>=0.7.0 +# jedi>=0.17.0 +# wcwidth>=0.1.9 +# traitlets>=4.3.3 +# tornado>=6.0.4 + +# Note: pyzmq has compatibility issues with Python 3.13 and is not required for core functionality +# pyzmq==19.0.1 # Commented out due to Python 3.13 compatibility issues \ No newline at end of file diff --git a/runListener.py b/runListener.py index 50ae4bd..925f3f5 100755 --- a/runListener.py +++ b/runListener.py @@ -3,15 +3,44 @@ import os import logging import time +import threading +import argparse from mastodon import StreamListener from lib.shared import base_folder, tootsFetchSettings, toots_folder -from lib.mastodon_auth import get_auth_user +try: + from lib.mastodon_auth import get_auth_user +except Exception as e: + logging.warning(f"Mastodon auth not configured: {e}") + get_auth_user = None from lib.mastodon_base import writeTootToFolder, writeTootToArchive, readTootFromFolder, deleteTootFromFolder from lib.TootForest import TootForest +from lib.bluesky_base import BlueSkyStreamListener +# Parse command line arguments +parser = argparse.ArgumentParser(description='Social Media Listener') +parser.add_argument('--platforms', type=str, default='all', + help='Platforms to listen to: all, mastodon, bluesky, or comma-separated list (e.g., mastodon,bluesky)') +parser.add_argument('--mastodon-only', action='store_true', help='Listen only to Mastodon') +parser.add_argument('--bluesky-only', action='store_true', help='Listen only to Bluesky') +args = parser.parse_args() -mastodon = get_auth_user() +# Determine which platforms to enable +if args.mastodon_only: + enable_mastodon = True + enable_bluesky = False +elif args.bluesky_only: + enable_mastodon = False + enable_bluesky = True +elif args.platforms.lower() == 'all': + enable_mastodon = True + enable_bluesky = True +else: + platforms = [p.strip().lower() for p in args.platforms.split(',')] + enable_mastodon = 'mastodon' in platforms + enable_bluesky = 'bluesky' in platforms + +mastodon = get_auth_user() if get_auth_user and enable_mastodon else None keyword = tootsFetchSettings['listen'] @@ -34,11 +63,73 @@ def update(): global existing_ids forest = TootForest.fromFolder(tootsFetchSettings['folder']) forest.saveApiJson(tootsFetchSettings['file']) - + # Update the global existing_ids variable files = os.listdir(dir_path) existing_ids = [os.path.splitext(file)[0] for file in files] +def handle_bluesky_post(bluesky_post): + """Handle incoming Bluesky posts, converting them to toot format and saving""" + global existing_ids + id = str(bluesky_post['id']) + if keyword in bluesky_post['content']: + logging.info('Listener got new Bluesky post: {}'.format(id)) + + # Check for story threading (same user replying to themselves with libmap link) + if (bluesky_post.get('in_reply_to_id') and + bluesky_post.get('in_reply_to_account_id') and + bluesky_post.get('account', {}).get('acct') == bluesky_post.get('in_reply_to_account_id')): + + # This is a self-reply, check if the parent has a libmap link and is in our collection + parent_id = str(bluesky_post['in_reply_to_id']) + if parent_id in existing_ids: + try: + parent_toot = readTootFromFolder(parent_id, tootsFetchSettings['folder']) + if keyword in parent_toot.get('content', ''): + # Parent has libmap link, this should be part of a story + # Find the root of the story by following the chain + root_id = parent_id + while True: + try: + root_toot = readTootFromFolder(root_id, tootsFetchSettings['folder']) + if (root_toot.get('in_reply_to_id') and + str(root_toot.get('in_reply_to_account_id')) == str(bluesky_post.get('account', {}).get('acct')) and + str(root_toot['in_reply_to_id']) in existing_ids): + root_id = str(root_toot['in_reply_to_id']) + else: + break + except: + break + + # Add story field to the post before saving + bluesky_post['story'] = root_id + logging.info('Added Bluesky story field: {} -> {}'.format(id, root_id)) + except Exception as e: + logging.warning('Error checking Bluesky story threading: {}'.format(e)) + + writeTootToFolder(bluesky_post, tootsFetchSettings['folder']) + # Update existing_ids immediately to include the new post + existing_ids.append(id) + update() + +def handle_bluesky_deletion(post_id): + """Handle Bluesky post deletions - only archive posts we have captured""" + global existing_ids + str_status = str(post_id) + if str_status in existing_ids: + # This is a post we actually captured and stored + logging.info('Archiving Bluesky post (id: {})!'.format(str_status)) + toot = readTootFromFolder(str_status, tootsFetchSettings['folder']) + archive_dir = os.path.join(toots_folder, tootsFetchSettings['folder'], 'archive') + os.makedirs(archive_dir, exist_ok=True) + + writeTootToArchive(toot, tootsFetchSettings['folder']) + deleteTootFromFolder(post_id, tootsFetchSettings['folder']) + # Remove from existing_ids immediately + existing_ids.remove(str_status) + update() + # If post_id not in existing_ids, we silently ignore it (not relevant to us) + class MastodonStreamListener(StreamListener): def __init__(self, mastodon): @@ -57,26 +148,101 @@ def stream_with_reconnection(self): break except Exception as e: self.receivedHeartbeat = False - logging.warning("Error: ", e) + logging.warning(f"Error: {e}") logging.info("Trying to reconnect after " + str(retry_delay) + " seconds.") retry_count += 1 time.sleep(retry_delay) - def on_update(self, toot): + def on_update(self, toot): + global existing_ids if keyword in toot['content']: id = str(toot['id']) # Convert the ID to a string, if it's not already logging.info('Listener got new toot: {}'.format(id)) + + # Check for story threading (same user replying to themselves with libmap link) + if (toot.get('in_reply_to_id') and + toot.get('in_reply_to_account_id') and + toot.get('account', {}).get('id') == toot.get('in_reply_to_account_id')): + + # This is a self-reply, check if the parent has a libmap link and is in our collection + parent_id = str(toot['in_reply_to_id']) + if parent_id in existing_ids: + try: + parent_toot = readTootFromFolder(parent_id, tootsFetchSettings['folder']) + if keyword in parent_toot.get('content', ''): + # Parent has libmap link, this should be part of a story + # Find the root of the story by following the chain + root_id = parent_id + while True: + try: + root_toot = readTootFromFolder(root_id, tootsFetchSettings['folder']) + if (root_toot.get('in_reply_to_id') and + str(root_toot.get('in_reply_to_account_id')) == str(toot.get('account', {}).get('id')) and + str(root_toot['in_reply_to_id']) in existing_ids): + root_id = str(root_toot['in_reply_to_id']) + else: + break + except: + break + + # Add story field to the toot before saving + toot['story'] = root_id + logging.info('Added story field: {} -> {}'.format(id, root_id)) + except Exception as e: + logging.warning('Error checking story threading: {}'.format(e)) + writeTootToFolder(toot, tootsFetchSettings['folder']) + # Update existing_ids immediately to include the new post + if id not in existing_ids: + existing_ids.append(id) update() - def on_status_update(self, toot): + def on_status_update(self, toot): + global existing_ids if keyword in toot['content']: - id = toot['id'] + id = str(toot['id']) logging.info('Listener got update of toot: {}'.format(id)) + + # Check for story threading (same user replying to themselves with libmap link) + if (toot.get('in_reply_to_id') and + toot.get('in_reply_to_account_id') and + toot.get('account', {}).get('id') == toot.get('in_reply_to_account_id')): + + # This is a self-reply, check if the parent has a libmap link and is in our collection + parent_id = str(toot['in_reply_to_id']) + if parent_id in existing_ids: + try: + parent_toot = readTootFromFolder(parent_id, tootsFetchSettings['folder']) + if keyword in parent_toot.get('content', ''): + # Parent has libmap link, this should be part of a story + # Find the root of the story by following the chain + root_id = parent_id + while True: + try: + root_toot = readTootFromFolder(root_id, tootsFetchSettings['folder']) + if (root_toot.get('in_reply_to_id') and + str(root_toot.get('in_reply_to_account_id')) == str(toot.get('account', {}).get('id')) and + str(root_toot['in_reply_to_id']) in existing_ids): + root_id = str(root_toot['in_reply_to_id']) + else: + break + except: + break + + # Add story field to the toot before saving + toot['story'] = root_id + logging.info('Added story field to updated toot: {} -> {}'.format(id, root_id)) + except Exception as e: + logging.warning('Error checking story threading on update: {}'.format(e)) + writeTootToFolder(toot, tootsFetchSettings['folder']) + # Update existing_ids immediately to include the new post + if id not in existing_ids: + existing_ids.append(id) update() def on_delete(self, status_id): + global existing_ids str_status = str(status_id) if str_status in existing_ids: logging.info('Archiving toot (id: {})!'.format(str_status)) @@ -86,6 +252,8 @@ def on_delete(self, status_id): writeTootToArchive(toot, tootsFetchSettings['folder']) deleteTootFromFolder(status_id, tootsFetchSettings['folder']) + # Remove from existing_ids immediately + existing_ids.remove(str_status) update() def handle_heartbeat(self): @@ -100,11 +268,53 @@ def on_abort(self, status_code): logging.info('Init Listener:') logging.info(' - Toots Folder: \'{}\''.format(tootsFetchSettings['folder'])) logging.info(' - Search String: \'{}\''.format(searchString)) +logging.info(' - Enabled Platforms: Mastodon={}, Bluesky={}'.format(enable_mastodon, enable_bluesky)) + +listener = MastodonStreamListener(mastodon) if mastodon and enable_mastodon else None + +# Initialize Bluesky listener if enabled +# Use the same keyword as Mastodon for consistency +keywords = [searchString] # Keep the full search string as one keyword +bluesky_listener = BlueSkyStreamListener(keywords, handle_bluesky_post) if enable_bluesky else None -listener = MastodonStreamListener(mastodon) +# Set up deletion callback for Bluesky +if bluesky_listener: + bluesky_listener.set_deletion_callback(handle_bluesky_deletion) logging.info('Init Toots API File ...') update() -logging.info('Start Listener ...') -listener.stream_with_reconnection() \ No newline at end of file +logging.info('Start Listeners ...') + +# Start Bluesky listener in a separate thread if enabled +bluesky_thread = None +if bluesky_listener: + bluesky_thread = threading.Thread(target=bluesky_listener.start_stream, daemon=True) + bluesky_thread.start() + logging.info('Bluesky listener started in background thread') +else: + logging.info('Bluesky listener disabled') + +# Start Mastodon listener (blocks) if enabled +if listener: + logging.info('Starting Mastodon listener...') + try: + listener.stream_with_reconnection() + except KeyboardInterrupt: + logging.info('Shutting down listeners...') + if bluesky_listener: + bluesky_listener.stop_stream() + raise +else: + logging.info('Mastodon listener disabled') + if bluesky_listener: + logging.info('Running with Bluesky only - press Ctrl+C to stop') + try: + # Keep the main thread alive when only Bluesky is running + while True: + time.sleep(1) + except KeyboardInterrupt: + logging.info('Shutting down Bluesky listener...') + bluesky_listener.stop_stream() + else: + logging.error('No listeners enabled! Use --help for usage information.') \ No newline at end of file From bb970657a12d53b5d115933b213ffd11c5eb79d2 Mon Sep 17 00:00:00 2001 From: sweing Date: Sat, 27 Sep 2025 14:17:00 +0200 Subject: [PATCH 10/12] add ut8 encodings to json imports --- downloadFromApi.py | 2 +- lib/mastodon_auth.py | 2 +- lib/mastodon_base.py | 10 +++++----- lib/shared.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/downloadFromApi.py b/downloadFromApi.py index c4e7585..f4f8748 100644 --- a/downloadFromApi.py +++ b/downloadFromApi.py @@ -21,7 +21,7 @@ os.makedirs(data_folder, exist_ok=True) # Load tweet IDs from fromAPI.json -with open(os.path.join(data_folder, "fromAPI.json"), "r") as json_file: +with open(os.path.join(data_folder, "fromAPI.json"), "r", encoding='utf-8') as json_file: tweet_data = json.load(json_file) tweet_ids = tweet_data.get("tweets", {}).keys() diff --git a/lib/mastodon_auth.py b/lib/mastodon_auth.py index db7e41f..19266cf 100644 --- a/lib/mastodon_auth.py +++ b/lib/mastodon_auth.py @@ -3,7 +3,7 @@ import json import os -with open(os.path.join(base_folder, 'config.json'), 'r') as f: +with open(os.path.join(base_folder, 'config.json'), 'r', encoding='utf-8') as f: config = json.load(f) def get_auth_user(): diff --git a/lib/mastodon_base.py b/lib/mastodon_base.py index 05aafd9..49b8578 100644 --- a/lib/mastodon_base.py +++ b/lib/mastodon_base.py @@ -15,17 +15,17 @@ def default(self, obj): def readTootFromFolder(id, folder = tootsFetchSettings['folder']): ff = os.path.join(toots_folder, folder) - with open(os.path.join(ff, '{}.json'.format(id)), 'r') as f: + with open(os.path.join(ff, '{}.json'.format(id)), 'r', encoding='utf-8') as f: return json.load(f) def writeTootToFolder(toot, folder = tootsFetchSettings['folder']): ff = os.path.join(toots_folder, folder) - with open(os.path.join(ff, '{}.json'.format(toot['id'])), 'w') as f: + with open(os.path.join(ff, '{}.json'.format(toot['id'])), 'w', encoding='utf-8') as f: json.dump(toot, f, cls=DateTimeEncoder) def writeTootToArchive(toot, folder = tootsFetchSettings['folder']): ff = os.path.join(toots_folder, folder, "archive") - with open(os.path.join(ff, '{}.json'.format(toot['id'])), 'w') as f: + with open(os.path.join(ff, '{}.json'.format(toot['id'])), 'w', encoding='utf-8') as f: json.dump(toot, f, cls=DateTimeEncoder) def deleteTootFromFolder(id, folder = tootsFetchSettings['folder']): @@ -47,9 +47,9 @@ def readTootFromMastodon(id): def writeTootsApiJson(data, tootsApiFile = tootsFetchSettings['file']): filePath = os.path.join(data_folder, tootsApiFile) logging.info('Writing new toots API file: \'{}\''.format(tootsApiFile)) - with open(filePath, 'w') as f: + with open(filePath, 'w', encoding='utf-8') as f: json.dump(data, f) def readTootsApiJson(tootsApiFile = tootsFetchSettings['file']): - with open(os.path.join(data_folder, tootsApiFile), 'r') as f: + with open(os.path.join(data_folder, tootsApiFile), 'r', encoding='utf-8') as f: return json.load(f) diff --git a/lib/shared.py b/lib/shared.py index f26947a..35fb282 100644 --- a/lib/shared.py +++ b/lib/shared.py @@ -7,7 +7,7 @@ data_folder = os.path.join(base_folder, 'data') toots_folder = os.path.join(data_folder, 'toots') -with open(os.path.join(base_folder, 'config.json'), 'r') as f: +with open(os.path.join(base_folder, 'config.json'), 'r', encoding='utf-8') as f: config = json.load(f) tootsFetchSettings = config['tootsFetchSettings']['list'][config['tootsFetchSettings']['default']] From 1c2a31c3b534d72401c0f2a3ad5315acce116d76 Mon Sep 17 00:00:00 2001 From: sweing Date: Fri, 17 Oct 2025 21:36:42 +0200 Subject: [PATCH 11/12] fixing url issue --- lib/Toot.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lib/Toot.py b/lib/Toot.py index d5f21ba..8bf07c6 100644 --- a/lib/Toot.py +++ b/lib/Toot.py @@ -178,9 +178,11 @@ def hasHashtag(self, hashtag): def getLinks(self): urls = [] content = self.getText() + is_bluesky = 'platform' in self.data and self.data['platform'] == 'bluesky' + has_html_content = 'content' in self.data and '<' in self.data['content'] # For Bluesky posts - extract URLs from structured data first - if 'platform' in self.data and self.data['platform'] == 'bluesky' and 'raw_record' in self.data: + if is_bluesky and 'raw_record' in self.data: raw_record = self.data['raw_record'] # Extract from embed.external.uri @@ -196,13 +198,15 @@ def getLinks(self): urls.append(feature['uri']) # For Mastodon posts with HTML content - if 'content' in self.data and '<' in self.data['content']: + if has_html_content: extractor = URLExtractor() extractor.feed(self.data['content']) urls.extend(extractor.urls) # Fallback: extract URLs from plain text content using regex - if content: + # Skip if we already found URLs from structured data (Bluesky) or HTML (Mastodon) + # This avoids extracting HTML-encoded duplicates or truncated display text + if content and not ((is_bluesky or has_html_content) and len(urls) > 0): import re # Match URLs in plain text (http/https URLs) url_pattern = r'https?://[^\s<>"]+|[a-zA-Z0-9][-a-zA-Z0-9]*\.(?:[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?)[^\s<>"]*' From 94884e5d8e993624ac26cac1cfcb4c05a0b69eac Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 5 Jan 2026 11:29:15 +0000 Subject: [PATCH 12/12] Add comprehensive documentation to TootForest class - Added detailed class-level docstring explaining the forest/tree structure concept - Documented all methods with parameter descriptions, return values, and examples - Explained the thread organization algorithm and self-reply logic - Described the ID transformation pipeline used for proper story sorting - Included usage examples for key methods - Clarified the distinction between trunks (top-level posts) and children (replies) This documentation makes the TootForest class's purpose and functionality clear for developers working with social media thread organization and API serialization. --- lib/TootForest.py | 221 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) diff --git a/lib/TootForest.py b/lib/TootForest.py index e931643..5b42f26 100644 --- a/lib/TootForest.py +++ b/lib/TootForest.py @@ -4,10 +4,58 @@ from collections import OrderedDict class TootForest(object): + """ + A TootForest represents a collection of social media posts organized into thread structures. + + This class organizes individual posts (toots) into a hierarchical tree structure where: + - 'Trunks' are top-level posts (posts that are not replies to themselves) + - 'Children' are self-replies that form conversation threads + + The forest structure enables: + - Thread visualization (showing posts and their reply chains) + - Story organization (grouping related posts that form a narrative) + - Proper ordering and serialization for API consumption + + A post qualifies as a 'self-reply' when it's a reply to another post by the same author, + which is commonly used to create threaded content or longer narratives that exceed + character limits on social media platforms. + + Attributes: + trunks (list): List of Toot objects representing top-level posts (thread starters). + Each trunk may have children forming a thread. + + Example: + # Load all toots from the default folder and organize into forest structure + forest = TootForest.fromFolder() + + # Save the organized structure as API-ready JSON + forest.saveApiJson() + + # Print a hierarchical text representation + print(forest) + """ trunks = [] @staticmethod def fromFolder(folder = tootsFetchSettings['folder']): + """ + Load toots from a folder and construct a TootForest. + + This factory method reads all toots from the specified folder, filters them + to only include posts containing links to the configured domain, and organizes + them into a forest structure. + + Args: + folder (str, optional): Path to the folder containing toot JSON files. + Defaults to the configured folder from tootsFetchSettings. + + Returns: + TootForest: A new TootForest instance containing the organized toots. + + Note: + Only toots with links matching tootsFetchSettings['link'] are included. + This typically filters for posts linking to a specific domain or path. + """ toots = Toot.loadFromFolder(folder) # Filter for posts that contain links to the configured domain toots = filter(lambda t: t.hasLinkTo(tootsFetchSettings['link']), toots) @@ -15,6 +63,32 @@ def fromFolder(folder = tootsFetchSettings['folder']): return TootForest(toots) def __init__(self, toots): + """ + Initialize a TootForest by organizing toots into thread structures. + + This constructor processes a collection of toots and organizes them into a tree + structure by identifying parent-child relationships through self-replies. The + algorithm works as follows: + + 1. Create a dictionary mapping toot IDs to toot objects for quick lookup + 2. For each toot, check if it's a self-reply (reply to the same author) + 3. If it's a self-reply and the parent exists, add it as a child to the parent + 4. Otherwise, add it as a trunk (top-level post) + + This creates a forest where: + - Each trunk represents the start of a thread + - Children are nested under their parent toots + - Only self-replies are connected (replies to the same author) + + Args: + toots (iterable): An iterable of Toot objects to organize into the forest. + + Example: + >>> toots = [toot1, toot2, toot3] # toot2 is a self-reply to toot1 + >>> forest = TootForest(toots) + >>> len(forest.trunks) # toot1 and toot3 are trunks, toot2 is a child + 2 + """ self.trunks = [] tootsDict = {t.getId(): t for t in toots} @@ -26,6 +100,30 @@ def __init__(self, toots): self.trunks.append(toot) def __str__(self): + """ + Generate a human-readable string representation of the forest. + + Creates a hierarchical text view of all toots in the forest, with indentation + showing the thread structure. Each level of reply is indented with two spaces. + + Returns: + str: A multi-line string showing the forest structure with: + - Username, datetime, and ID + - Hashtags prefixed with '#' + - Links prefixed with '▶' + - Content prefixed with '¶' + - Indentation showing reply depth + + Example output: + user1, 2024-01-15 10:30:00, 123456 + # python, coding + ▶ https://example.com/article + ¶ This is the main post content + + user1, 2024-01-15 10:35:00, 123457 + # python + ¶ This is a reply to the above post + """ def printLeafs(toots, i = 0): a = [] for toot in toots: @@ -37,6 +135,39 @@ def printLeafs(toots, i = 0): return '\n'.join(printLeafs(self.trunks)) def asApiJson(self): + """ + Convert the forest into API-ready JSON format. + + Transforms the forest structure into a dictionary suitable for API responses. + For threaded posts (posts with children), this method: + - Assigns a 'story' ID (the ID of the root post) to link related posts + - Follows the chain of first children to create linear story progression + - Includes all posts with their metadata in API dictionary format + + Returns: + dict: A dictionary mapping toot IDs to their API representations. + Each value contains: + - url: Link to the referenced content + - hashtags: List of hashtags + - timestamp: Post creation time + - content: Post text + - account: Username + - display_name: User's display name + - avatar: User avatar URL + - media: List of media URLs + - source: Platform (mastodon.social, bluesky, etc.) + - story: Story ID (only for threaded posts) + + Note: + For threads, only the FIRST child is followed at each level, creating + a linear narrative. Additional children (branches) are not included. + + Example: + >>> forest = TootForest.fromFolder() + >>> api_data = forest.asApiJson() + >>> print(api_data['123456']['story']) # Story ID for a threaded post + '123456' + """ apiJson = {}; for toot in self.trunks: if toot.hasChildren(): @@ -52,6 +183,42 @@ def asApiJson(self): return apiJson def rename_ids(self, data): + """ + Transform toot IDs into sortable keys for proper story ordering. + + This method prepends sequence numbers to toot IDs to enable proper sorting + of stories and their posts. The transformation allows stories to be sorted + in reverse chronological order while maintaining correct post order within + each story. + + The algorithm: + 1. Calculates the total length of each story (number of posts) + 2. Assigns each post a descending count within its story + 3. Creates composite keys: "story_id.count_tweet_id" + + This ensures that when sorted alphabetically: + - Stories are ordered by their root post ID + - Posts within a story maintain their sequential order + - Later posts in a thread have lower counts (e.g., 3, 2, 1) + + Args: + data (dict): Dictionary mapping toot IDs to their API representations. + Each value may contain a 'story' key for threaded posts. + + Returns: + dict: New dictionary with composite keys in format "story_id.count_tweet_id". + The count starts from the story length and decrements for each post. + + Example: + >>> data = { + ... '100': {'story': '100', 'content': 'First post'}, + ... '101': {'story': '100', 'content': 'Second post'}, + ... '102': {'content': 'Standalone post'} + ... } + >>> renamed = forest.rename_ids(data) + >>> list(renamed.keys()) + ['100.2_100', '100.1_101', '102.1_102'] + """ # Create a dictionary to store the count of each story ID story_counts = {} @@ -83,6 +250,33 @@ def rename_ids(self, data): return result def revert_ids(self, data): + """ + Revert composite keys back to original toot IDs. + + This method reverses the transformation done by rename_ids(), extracting + the original toot ID from composite keys. After sorting with renamed keys, + this restores the original ID structure while preserving the sorted order. + + The transformation: + - Input: "story_id.count_tweet_id" (e.g., "100.2_123") + - Output: "tweet_id" (e.g., "123") + + Args: + data (dict): Dictionary with composite keys in format "story_id.count_tweet_id". + + Returns: + dict: New dictionary with original toot IDs as keys, maintaining the + sorted order from the input dictionary. + + Example: + >>> sorted_data = OrderedDict([ + ... ('100.2_100', {'content': 'First'}), + ... ('100.1_101', {'content': 'Second'}) + ... ]) + >>> reverted = forest.revert_ids(sorted_data) + >>> list(reverted.keys()) + ['100', '101'] # Original IDs restored, sorted order preserved + """ # Iterate through the dictionary and create a new one with reverted keys result = {} for key, values in data.items(): @@ -100,6 +294,33 @@ def revert_ids(self, data): def saveApiJson(self, file = tootsFetchSettings['file']): + """ + Save the forest as sorted, API-ready JSON to a file. + + This method performs a complete transformation pipeline: + 1. Convert forest to API JSON format (asApiJson) + 2. Rename IDs to make them sortable (rename_ids) + 3. Sort in descending order (newest stories first) + 4. Revert IDs to original format (revert_ids) + 5. Write to JSON file + + The sorting ensures: + - Stories appear in reverse chronological order (newest first) + - Posts within each story maintain their sequential order + - The final output has clean toot IDs despite complex sorting logic + + Args: + file (str, optional): Path to the output JSON file. + Defaults to the configured file from tootsFetchSettings. + + Side Effects: + Writes a JSON file containing the sorted toots with their metadata. + + Example: + >>> forest = TootForest.fromFolder() + >>> forest.saveApiJson() # Saves to default location + >>> forest.saveApiJson('/custom/path/toots.json') # Custom location + """ data = self.asApiJson() renamed_data = self.rename_ids(data) # Sort toots based on id in descending order