diff --git a/api/__init__.py b/api/__init__.py index e1b8ada..255f864 100644 --- a/api/__init__.py +++ b/api/__init__.py @@ -1,6 +1,6 @@ from flask import Flask, current_app, render_template, request, session from flask_cors import CORS -from .modules import tweets, log +from .modules import toots, log from .modules.auth import login_exempt from lib.shared import authToken, secretKey from datetime import timedelta @@ -12,7 +12,7 @@ def create_app(test_config = None): cors = CORS(app) - app.register_blueprint(tweets.bp) + app.register_blueprint(toots.bp) app.register_blueprint(log.bp) @app.route('/') diff --git a/api/modules/toots.py b/api/modules/toots.py new file mode 100644 index 0000000..17c897b --- /dev/null +++ b/api/modules/toots.py @@ -0,0 +1,148 @@ +from datetime import datetime +import logging +import json +import os + +from flask import Blueprint, jsonify, Response, render_template, abort +from flask_cors import cross_origin +from .auth import login_exempt + +from lib.mastodon_base import readTootsApiJson +from lib.Toot import Toot +from lib.TootForest import TootForest +from lib.shared import data_folder + +bp = Blueprint('tweets', __name__, url_prefix='/tweets') + +@bp.route('/') +@cross_origin() +@login_exempt +def root(): + # Read the current tweets from the API or other source + current_tweets = readTootsApiJson() + + # Read additional tweets from file_tweets_transformed.json + try: + with open(os.path.join(data_folder, 'file_tweets_transformed.json'), 'r', encoding='utf-8') as file: + file_tweets_data = json.load(file) + # Check if the data is a dictionary + if isinstance(file_tweets_data, dict): + # If the current tweets are also a dictionary, update it + if isinstance(current_tweets, dict): + current_tweets.update(file_tweets_data) + # If the current tweets are a list, convert the dictionary to a list and extend it + elif isinstance(current_tweets, list): + current_tweets.extend(file_tweets_data.values()) + else: + raise ValueError("The current tweets are neither a list nor a dictionary.") + else: + raise ValueError("The contents of file_tweets_transformed.json are not a dictionary") + except FileNotFoundError: + logging.error("The file file_tweets_transformed.json was not found.") + except json.JSONDecodeError: + logging.error("The file file_tweets_transformed.json does not contain valid JSON.") + except ValueError as e: + logging.error(e) + + # Return the combined data as a JSON response + return Response(json.dumps({ + 'date': int(datetime.timestamp(datetime.now())), + 'tweets': current_tweets + }), mimetype='application/json') + + +@bp.route('/<id>') +def tweet(id): + tweet = Toot.loadFromFile(id) + return jsonify(tweet.data) + +@bp.route('/forest') +def forest_show(): + return render_template('forest.html.j2', forest = TootForest.fromFolder()) + #return Response(str(forest), mimetype='text/plain') + +@bp.route('/forest/renew') +def forest_create(): + logging.warning('Manual invocation of creating forest!') + forest = TootForest.fromFolder() + forest.saveApiJson() + + # Read the current tweets from the API or other source + current_tweets = readTootsApiJson() + + # Read additional tweets from file_tweets_transformed.json + try: + with open(os.path.join(data_folder, 'file_tweets_transformed.json'), 'r', encoding='utf-8') as file: + file_tweets_data = json.load(file) + # Check if the data is a dictionary + if isinstance(file_tweets_data, dict): + # If the current tweets are also a dictionary, update it + if isinstance(current_tweets, dict): +
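# Note: dict.update() lets file entries overwrite API entries that share the same id +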
current_tweets.update(file_tweets_data) + # If the current tweets are a list, convert the dictionary to a list and extend it + elif isinstance(current_tweets, list): + current_tweets.extend(file_tweets_data.values()) + else: + raise ValueError("The current tweets are neither a list nor a dictionary.") + else: + raise ValueError("The contents of file_tweets_transformed.json are not a dictionary") + except FileNotFoundError: + logging.error("The file file_tweets_transformed.json was not found.") + except json.JSONDecodeError: + logging.error("The file file_tweets_transformed.json does not contain valid JSON.") + except ValueError as e: + logging.error(e) + + # Return the combined data as a JSON response + return Response(json.dumps(current_tweets), mimetype='application/json') + +@bp.route('/add/<id>') +def add(id): + logging.warning('Manual invocation of adding tweet (id: {})!'.format(id)) + tweet = Toot.loadFromMastodon(id) + tweet.save() + # renew forest + forest = TootForest.fromFolder() + forest.saveApiJson() + return jsonify({ + 'message': 'added', + 'tweet': { + 'id': id, + 'data': tweet.data + } + }) + +@bp.route('/delete/<id>') +def delete(id): + logging.warning('Manual invocation of deleting tweet (id: {})!'.format(id)) + tweet = Toot.loadFromFile(id) + tweet.delete() + # renew forest + forest = TootForest.fromFolder() + forest.saveApiJson() + return jsonify({ + 'message': 'deleted', + 'tweet': { + 'id': id, + 'data': tweet.data + } + }) + +@bp.route('/all') +def all(): + tweets = Toot.loadFromFolder() + tweets.sort(key = lambda x: x.getDateTime(), reverse = True) + # [Toot(i) for i in tweets] + return render_template('all.html.j2', tweets = tweets) + +@bp.route('/stories') +def info(): + tweets = readTootsApiJson() + stories = {}; + for id, info in tweets.items(): + if 'story' in info: + storyId = info['story'] + if storyId not in stories: + stories[storyId] = [] + stories[storyId].append(Toot.loadFromFile(id)) + return render_template('stories.html.j2', stories = stories) diff --git a/api/modules/tweets.py b/api/modules/tweets.py deleted file mode 100644 index cddbbfb..0000000 --- a/api/modules/tweets.py +++ /dev/null @@ -1,91 +0,0 @@ -from datetime import datetime -import logging - -from flask import Blueprint, jsonify, Response, render_template, abort -from flask_cors import cross_origin -from .auth import login_exempt - -from lib.tweets_base import readTweetsApiJson -from lib.Tweet import Tweet -from lib.TweetForest import TweetForest - -bp = Blueprint('tweets', __name__, url_prefix='/tweets') - -@bp.route('/') -@cross_origin() -@login_exempt -def root(): - return jsonify({ - 'date': int(datetime.timestamp(datetime.now())), - 'tweets': readTweetsApiJson() - }) - - -@bp.route('/<id>') -def tweet(id): - tweet = Tweet.loadFromFile(id) - return jsonify(tweet.data) - -@bp.route('/forest') -def forest_show(): - return render_template('forest.html.j2', forest = TweetForest.fromFolder()) - #return Response(str(forest), mimetype='text/plain') - -@bp.route('/forest/renew') -def forest_create(): - logging.warning('Manual invocation of creating forest!') - # renew forest - forest = TweetForest.fromFolder() - forest.saveApiJson() - return jsonify(readTweetsApiJson()) - -@bp.route('/add/<id>') -def add(id): - logging.warning('Manual invocation of adding tweet (id: {})!'.format(id)) - tweet = Tweet.loadFromTwitter(id) - tweet.save() - # renew forest - forest = TweetForest.fromFolder() - forest.saveApiJson() - return jsonify({ - 'message': 'added', - 'tweet': { - 'id': id, - 'data': tweet.data - } - }) -
-@bp.route('/delete/<id>') -def delete(id): - logging.warning('Manual invocation of deleting tweet (id: {})!'.format(id)) - tweet = Tweet.loadFromFile(id) - tweet.delete() - # renew forest - forest = TweetForest.fromFolder() - forest.saveApiJson() - return jsonify({ - 'message': 'deleted', - 'tweet': { - 'id': id, - 'data': tweet.data - } - }) - -@bp.route('/all') -def all(): - tweets = Tweet.loadFromFolder() - tweets.sort(key = lambda x: x.getDateTime(), reverse = True) - # [Tweet(i) for i in tweets] - return render_template('all.html.j2', tweets = tweets) - -@bp.route('/stories') -def info(): - tweets = readTweetsApiJson() - stories = {}; - for id, info in tweets.items(): - if 'story' in info: - storyId = info['story'] - if storyId not in stories: - stories[storyId] = [] - stories[storyId].append(Tweet.loadFromFile(id)) - return render_template('stories.html.j2', stories = stories) diff --git a/config.template.json b/config.template.json index f54f6a2..6cf829d 100644 --- a/config.template.json +++ b/config.template.json @@ -1,6 +1,16 @@ { "authToken": "xxx", "secretKey": "xxx", + "mastodon": { + "instance_url": "https://mastodon.instance", + "client": { + "key": "xxx", + "secret": "xxx" + }, + "access": { + "token": "xxx" + } + }, "twitter": { "api": { "key": "xxx", diff --git a/downloadFromApi.py b/downloadFromApi.py new file mode 100644 index 0000000..f4f8748 --- /dev/null +++ b/downloadFromApi.py @@ -0,0 +1,43 @@ +import os +import requests +import json +from lib.shared import authToken + +BASE_URL = "https://dev.decarbnow.space" +AUTH_URL = BASE_URL + "/authenticate/" +DOWNLOAD_URL = BASE_URL + "/tweets/" + +# Create a session to maintain authentication +session = requests.Session() + +# Authenticate using the provided token +auth_response = session.get(AUTH_URL + authToken) +if auth_response.status_code != 200: + print(f"Authentication failed with status code: {auth_response.status_code}") + exit() + +# Path to the data folder; downloads are written to its 'live' subfolder +data_folder = "data" +os.makedirs(os.path.join(data_folder, "live"), exist_ok=True) + +# Load tweet IDs from fromAPI.json +with open(os.path.join(data_folder, "fromAPI.json"), "r", encoding='utf-8') as json_file: + tweet_data = json.load(json_file) + tweet_ids = tweet_data.get("tweets", {}).keys() + +# Download files for each tweet ID +for tweet_id in tweet_ids: + download_url = DOWNLOAD_URL + tweet_id + + # Send request to download the file + file_response = session.get(download_url) + if file_response.status_code == 200: + # Save the file in the data folder + with open(os.path.join(data_folder, "live", f"{tweet_id}.json"), "wb") as file: + file.write(file_response.content) + print(f"Downloaded file for tweet ID {tweet_id}") + else: + print(f"Failed to download file for tweet ID {tweet_id} with status code: {file_response.status_code}") + +# Close the session +session.close() diff --git a/getOldToots.py b/getOldToots.py new file mode 100644 index 0000000..728063f --- /dev/null +++ b/getOldToots.py @@ -0,0 +1,61 @@ +from mastodon import Mastodon +import json +from datetime import datetime +import os +import shutil +from lib.mastodon_auth import get_auth_user + +from lib.shared import base_folder, tootsFetchSettings, toots_folder + +# Define the URL you want to check for in the messages +keyword = tootsFetchSettings['listen'] + '/@' + +# Define the path for the data folder and archive subfolder +data_folder = os.path.join(toots_folder, tootsFetchSettings['folder']) + +archive_folder = os.path.join(data_folder, "archive") + +# Ensure the data and archive folders exist
+os.makedirs(data_folder, exist_ok=True) +os.makedirs(archive_folder, exist_ok=True) + +# Get a list of existing message IDs in the data folder +existing_ids = [filename.split(".")[0] for filename in os.listdir(data_folder) if filename.endswith(".json")] + +# Create an instance of Mastodon +mastodon = get_auth_user() + +# Custom JSON encoder to handle datetime objects +class DateTimeEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, datetime): + return obj.strftime('%Y-%m-%d %H:%M:%S') + return super().default(obj) + +# Search for messages containing the target URL +search_results = mastodon.search(keyword) + +# Check each search result for the target URL +for status in search_results['statuses']: + # Generate a unique filename using ID + message_id = str(status['id']) + filename = f'{message_id}.json' + filepath = os.path.join(data_folder, filename) + + # Save the relevant information to a JSON file + with open(filepath, 'w') as output_file: + json.dump(status, output_file, cls=DateTimeEncoder) + + print(f'Message with ID {message_id} saved to {filepath}') + + # Remove the ID from the existing IDs list if found + if message_id in existing_ids: + existing_ids.remove(message_id) + +# Move any remaining files in the data folder to the archive folder +for message_id in existing_ids: + filename = f'{message_id}.json' + filepath = os.path.join(data_folder, filename) + archive_filepath = os.path.join(archive_folder, filename) + shutil.move(filepath, archive_filepath) + print(f'Message with ID {message_id} moved to archive folder') diff --git a/lib/Toot.py b/lib/Toot.py new file mode 100644 index 0000000..8bf07c6 --- /dev/null +++ b/lib/Toot.py @@ -0,0 +1,234 @@ +from .shared import toots_folder, tootsFetchSettings +#from .mastodon_auth import get_api_app +from .mastodon_base import * + +from datetime import datetime +from html.parser import HTMLParser + +class URLExtractor(HTMLParser): + def __init__(self): + super().__init__() + self.urls = [] + + def handle_starttag(self, tag, attrs): + if tag == 'a': + self.urls.extend(value for name, value in attrs if name == 'href') + +class Toot(object): + data = None + + @staticmethod + def loadFromFile(id, folder = tootsFetchSettings['folder']): + return Toot(readTootFromFolder(id, folder)) + + @staticmethod + def loadFromFolder(folder = tootsFetchSettings['folder']): + return [Toot(j) for j in readTootsFromFolder(folder)] + + @staticmethod + def loadFromMastodon(id): + return Toot(readTootFromMastodon(id)) + + + def __init__(self, data): + self.data = data + self.children = [] + + def save(self, folder = tootsFetchSettings['folder']): + writeTootToFolder(self.data, folder) + + def delete(self, folder = tootsFetchSettings['folder']): + deleteTootFromFolder(self.getId(), folder) + + def addChild(self, toot): + self.children.append(toot) + + def getChildren(self): + return self.children + + def hasChildren(self): + return len(self.children) > 0 + + def asApiDict(self, add = {}): + # Determine source platform + source = 'mastodon.social' # Default + if 'platform' in self.data: + if self.data['platform'] == 'bluesky': + source = 'bluesky' + + return { + 'url': self.getPathOfLinksTo()[0], + 'hashtags': self.getHashtags(), + 'timestamp': str(self.getDateTime()), + 'content': self.getText(), + 'account': self.getUserName(), + 'display_name': self.getUserScreenName(), + 'avatar': self.getUserImageHttps(), + 'media': self.getMedia(), + 'source': source, + **add + } + + def getData(self): + return self.data + + def toStringList(self): + 
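# Four lines: a "user, datetime, id" header, hashtags (#), links (▶), and text (¶) +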
return [ + "{}, {}, {}".format(self.getUserName(), self.getDateTime(), self.getId()), + "# {}".format(', '.join(self.getHashtags())), + "▶ {}".format(', '.join(self.getLinks())), + "¶ {}".format(self.getText()) + ] + + def __str__(self): + return '\n'.join(self.toStringList()) + + def getId(self): + return self.data['id'] + + def getUserId(self): + return self.data['account'].get('id', self.data['account'].get('acct', '')) + + def getUserName(self): + return self.data['account']['username'] + + def getUserScreenName(self): + return self.data['account']['display_name'] + + def getUserImageHttps(self): + return self.data['account'].get('avatar_static', '') + + def getDateTime(self): + date_str = self.data['created_at'] + # Handle different date formats (Mastodon vs Bluesky) + try: + return datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S') + except ValueError: + try: + # ISO format with microseconds (Bluesky) + return datetime.fromisoformat(date_str.replace('Z', '+00:00')) + except ValueError: + # Fallback - return current time if parsing fails + return datetime.now() + + def getMedia(self): + media = [] + # Handle Mastodon-style media + if 'media_attachments' in self.data: + media.extend([item['preview_url'] for item in self.data['media_attachments'] if 'preview_url' in item]) + # Handle Bluesky media from embed + if 'platform' in self.data and self.data['platform'] == 'bluesky' and 'raw_record' in self.data: + raw_record = self.data['raw_record'] + if 'embed' in raw_record: + # Handle image embeds + if raw_record['embed'].get('$type') == 'app.bsky.embed.images' and 'images' in raw_record['embed']: + for image in raw_record['embed']['images']: + if 'image' in image and '$link' in image['image']['ref']: + # Convert blob reference to CDN URL + blob_ref = image['image']['ref']['$link'] + cdn_url = f"https://cdn.bsky.app/img/feed_thumbnail/plain/did:plc:adsquh2z4vzbpeelyvkq4rbl/{blob_ref}@jpeg" + media.append(cdn_url) + return media + + def getText(self): + if 'content' in self.data: + return self.data['content'] + elif 'extended_toot' in self.data: + if 'full_text' in self.data['extended_toot']: + return self.data['extended_toot']['full_text'] + elif 'retooted_status' in self.data and 'extended_toot' in self.data['retooted_status'] and self.data['retooted_status']['extended_toot']['full_text']: + return self.data['retooted_status']['extended_toot']['full_text'] + elif 'text' in self.data: + return self.data['text'] + + def isReply(self): + return 'in_reply_to_id' in self.data + + def getReplyToId(self): + return self.data['in_reply_to_id'] + + def getReplyToUserId(self): + return self.data['in_reply_to_account_id'] + + def isSelfReply(self): + return self.isReply() and self.getUserId() == self.getReplyToUserId() + + def getHashtags(self): + hashtags = [] + # Handle Mastodon-style tags + if 'tags' in self.data and self.data['tags']: + hashtags.extend([h['name'] for h in self.data['tags']]) + # Handle extended_toot format + if 'extended_toot' in self.data and 'entities' in self.data['extended_toot'] and 'hashtags' in self.data['extended_toot']['entities']: + hashtags.extend([h['text'] for h in self.data['extended_toot']['entities']['hashtags']]) + # Handle Bluesky hashtags from facets + if 'platform' in self.data and self.data['platform'] == 'bluesky' and 'raw_record' in self.data: + raw_record = self.data['raw_record'] + if 'facets' in raw_record: + for facet in raw_record['facets']: + if 'features' in facet: + for feature in facet['features']: + if feature.get('$type') == 
'app.bsky.richtext.facet#tag' and 'tag' in feature: + hashtags.append(feature['tag']) + return hashtags + + def hasHashtag(self, hashtag): + return hashtag in self.getHashtags() + + def getLinks(self): + urls = [] + content = self.getText() + is_bluesky = 'platform' in self.data and self.data['platform'] == 'bluesky' + has_html_content = 'content' in self.data and '<' in self.data['content'] + + # For Bluesky posts - extract URLs from structured data first + if is_bluesky and 'raw_record' in self.data: + raw_record = self.data['raw_record'] + + # Extract from embed.external.uri + if 'embed' in raw_record and 'external' in raw_record['embed'] and 'uri' in raw_record['embed']['external']: + urls.append(raw_record['embed']['external']['uri']) + + # Extract from facets (rich text links) + if 'facets' in raw_record: + for facet in raw_record['facets']: + if 'features' in facet: + for feature in facet['features']: + if feature.get('$type') == 'app.bsky.richtext.facet#link' and 'uri' in feature: + urls.append(feature['uri']) + + # For Mastodon posts with HTML content + if has_html_content: + extractor = URLExtractor() + extractor.feed(self.data['content']) + urls.extend(extractor.urls) + + # Fallback: extract URLs from plain text content using regex + # Skip if we already found URLs from structured data (Bluesky) or HTML (Mastodon) + # This avoids extracting HTML-encoded duplicates or truncated display text + if content and not ((is_bluesky or has_html_content) and len(urls) > 0): + import re + # Match URLs in plain text (http/https URLs) + url_pattern = r'https?://[^\s<>"]+|[a-zA-Z0-9][-a-zA-Z0-9]*\.(?:[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?)[^\s<>"]*' + text_urls = re.findall(url_pattern, content) + urls.extend(text_urls) + + return list(set(urls)) # Remove duplicates + + def getPathOfLinksTo(self, to = tootsFetchSettings['link']): + links = self.getLinks() + matching_links = [u for u in links if to in u] + + # For Bluesky posts, prioritize full URLs (those starting with https://) over truncated ones + if 'platform' in self.data and self.data['platform'] == 'bluesky': + # Sort so full URLs come first + matching_links.sort(key=lambda x: not x.startswith('https://')) + + return [u.split(to, 1)[1] for u in matching_links] + + def hasLinkTo(self, to = f"{tootsFetchSettings['link']}/@"): + return len(self.getPathOfLinksTo(to)) > 0 + + def getUrl(self): + if 'url' in self.data: + return self.data['url'] diff --git a/lib/TootForest.py b/lib/TootForest.py new file mode 100644 index 0000000..5b42f26 --- /dev/null +++ b/lib/TootForest.py @@ -0,0 +1,440 @@ +from .shared import tootsFetchSettings +from .mastodon_base import readTootsFromFolder, writeTootsApiJson +from .Toot import Toot +from collections import OrderedDict + +class TootForest(object): + """ + A TootForest represents a collection of social media posts organized into thread structures. 
+ + This class organizes individual posts (toots) into a hierarchical tree structure where: + - 'Trunks' are top-level posts (posts that are not replies to themselves) + - 'Children' are self-replies that form conversation threads + + The forest structure enables: + - Thread visualization (showing posts and their reply chains) + - Story organization (grouping related posts that form a narrative) + - Proper ordering and serialization for API consumption + + A post qualifies as a 'self-reply' when it's a reply to another post by the same author, + which is commonly used to create threaded content or longer narratives that exceed + character limits on social media platforms. + + Attributes: + trunks (list): List of Toot objects representing top-level posts (thread starters). + Each trunk may have children forming a thread. + + Example: + # Load all toots from the default folder and organize into forest structure + forest = TootForest.fromFolder() + + # Save the organized structure as API-ready JSON + forest.saveApiJson() + + # Print a hierarchical text representation + print(forest) + """ + trunks = [] + + @staticmethod + def fromFolder(folder = tootsFetchSettings['folder']): + """ + Load toots from a folder and construct a TootForest. + + This factory method reads all toots from the specified folder, filters them + to only include posts containing links to the configured domain, and organizes + them into a forest structure. + + Args: + folder (str, optional): Path to the folder containing toot JSON files. + Defaults to the configured folder from tootsFetchSettings. + + Returns: + TootForest: A new TootForest instance containing the organized toots. + + Note: + Only toots with links matching tootsFetchSettings['link'] are included. + This typically filters for posts linking to a specific domain or path. + """ + toots = Toot.loadFromFolder(folder) + # Filter for posts that contain links to the configured domain + toots = filter(lambda t: t.hasLinkTo(tootsFetchSettings['link']), toots) + # toots = filter(lambda t: not t.hasHashtag('private'), toots) + return TootForest(toots) + + def __init__(self, toots): + """ + Initialize a TootForest by organizing toots into thread structures. + + This constructor processes a collection of toots and organizes them into a tree + structure by identifying parent-child relationships through self-replies. The + algorithm works as follows: + + 1. Create a dictionary mapping toot IDs to toot objects for quick lookup + 2. For each toot, check if it's a self-reply (reply to the same author) + 3. If it's a self-reply and the parent exists, add it as a child to the parent + 4. Otherwise, add it as a trunk (top-level post) + + This creates a forest where: + - Each trunk represents the start of a thread + - Children are nested under their parent toots + - Only self-replies are connected (replies to the same author) + + Args: + toots (iterable): An iterable of Toot objects to organize into the forest. + + Example: + >>> toots = [toot1, toot2, toot3] # toot2 is a self-reply to toot1 + >>> forest = TootForest(toots) + >>> len(forest.trunks) # toot1 and toot3 are trunks, toot2 is a child + 2 + """ + self.trunks = [] + tootsDict = {t.getId(): t for t in toots} + + for toot in tootsDict.values(): + if toot.isSelfReply(): + if toot.getReplyToId() in tootsDict: + tootsDict[toot.getReplyToId()].addChild(toot) + continue + self.trunks.append(toot) + + def __str__(self): + """ + Generate a human-readable string representation of the forest. 
+ + Creates a hierarchical text view of all toots in the forest, with indentation + showing the thread structure. Each level of reply is indented with two spaces. + + Returns: + str: A multi-line string showing the forest structure with: + - Username, datetime, and ID + - Hashtags prefixed with '#' + - Links prefixed with '▶' + - Content prefixed with '¶' + - Indentation showing reply depth + + Example output: + user1, 2024-01-15 10:30:00, 123456 + # python, coding + ▶ https://example.com/article + ¶ This is the main post content + + user1, 2024-01-15 10:35:00, 123457 + # python + ¶ This is a reply to the above post + """ + def printLeafs(toots, i = 0): + a = [] + for toot in toots: + a.extend(["{}{}".format(' ' * i, l) for l in toot.toStringList()]) + a.append('') + if toot.hasChildren(): + a.extend(printLeafs(toot.getChildren(), i + 1)) + return a + return '\n'.join(printLeafs(self.trunks)) + + def asApiJson(self): + """ + Convert the forest into API-ready JSON format. + + Transforms the forest structure into a dictionary suitable for API responses. + For threaded posts (posts with children), this method: + - Assigns a 'story' ID (the ID of the root post) to link related posts + - Follows the chain of first children to create linear story progression + - Includes all posts with their metadata in API dictionary format + + Returns: + dict: A dictionary mapping toot IDs to their API representations. + Each value contains: + - url: Link to the referenced content + - hashtags: List of hashtags + - timestamp: Post creation time + - content: Post text + - account: Username + - display_name: User's display name + - avatar: User avatar URL + - media: List of media URLs + - source: Platform (mastodon.social, bluesky, etc.) + - story: Story ID (only for threaded posts) + + Note: + For threads, only the FIRST child is followed at each level, creating + a linear narrative. Additional children (branches) are not included. + + Example: + >>> forest = TootForest.fromFolder() + >>> api_data = forest.asApiJson() + >>> print(api_data['123456']['story']) # Story ID for a threaded post + '123456' + """ + apiJson = {}; + for toot in self.trunks: + if toot.hasChildren(): + story = toot.getId() + apiJson[story] = toot.asApiDict({'story': str(story)}) + while toot.hasChildren(): + # ONLY FIRST CHILD + toot = toot.getChildren()[0] + apiJson[toot.getId()] = toot.asApiDict({'story': str(story)}) + else: + apiJson[toot.getId()] = toot.asApiDict() + + return apiJson + + def rename_ids(self, data): + """ + Transform toot IDs into sortable keys for proper story ordering. + + This method prepends sequence numbers to toot IDs to enable proper sorting + of stories and their posts. The transformation allows stories to be sorted + in reverse chronological order while maintaining correct post order within + each story. + + The algorithm: + 1. Calculates the total length of each story (number of posts) + 2. Assigns each post a descending count within its story + 3. Creates composite keys: "story_id.count_tweet_id" + + This ensures that when sorted alphabetically: + - Stories are ordered by their root post ID + - Posts within a story maintain their sequential order + - Later posts in a thread have lower counts (e.g., 3, 2, 1) + + Args: + data (dict): Dictionary mapping toot IDs to their API representations. + Each value may contain a 'story' key for threaded posts. + + Returns: + dict: New dictionary with composite keys in format "story_id.count_tweet_id". + The count starts from the story length and decrements for each post. 
+ + Example: + >>> data = { + ... '100': {'story': '100', 'content': 'First post'}, + ... '101': {'story': '100', 'content': 'Second post'}, + ... '102': {'content': 'Standalone post'} + ... } + >>> renamed = forest.rename_ids(data) + >>> list(renamed.keys()) + ['100.2_100', '100.1_101', '102.1_102'] + """ + # Create a dictionary to store the count of each story ID + story_counts = {} + + # Create a dictionary to store the length of each story + story_lengths = {} + + # Calculate the length of each story before the loop + for tweet_id, values in data.items(): + # Check if "story" key is present, otherwise use tweet_id as the story ID + story_id = values.get("story", tweet_id) + + # Update the length of the story + story_lengths[story_id] = story_lengths.get(story_id, 0) + 1 + + # Iterate through the original dictionary and create a new one with modified keys + result = {} + for tweet_id, values in data.items(): + # Check if "story" key is present, otherwise use tweet_id as the story ID + story_id = values.get("story", tweet_id) + + # Calculate the count as the difference between the length of the story and the current length + count = story_lengths[story_id] + story_lengths[story_id] = story_lengths[story_id] - 1 + + # Create the new key in the format "story_id.count_tweet_id" + new_key = f"{story_id}.{count}_{tweet_id}" + result[new_key] = values + + return result + + def revert_ids(self, data): + """ + Revert composite keys back to original toot IDs. + + This method reverses the transformation done by rename_ids(), extracting + the original toot ID from composite keys. After sorting with renamed keys, + this restores the original ID structure while preserving the sorted order. + + The transformation: + - Input: "story_id.count_tweet_id" (e.g., "100.2_123") + - Output: "tweet_id" (e.g., "123") + + Args: + data (dict): Dictionary with composite keys in format "story_id.count_tweet_id". + + Returns: + dict: New dictionary with original toot IDs as keys, maintaining the + sorted order from the input dictionary. + + Example: + >>> sorted_data = OrderedDict([ + ... ('100.2_100', {'content': 'First'}), + ... ('100.1_101', {'content': 'Second'}) + ... ]) + >>> reverted = forest.revert_ids(sorted_data) + >>> list(reverted.keys()) + ['100', '101'] # Original IDs restored, sorted order preserved + """ + # Iterate through the dictionary and create a new one with reverted keys + result = {} + for key, values in data.items(): + # Split the key into story_id.count_tweet_id + parts = key.split('_') + + # Extract the tweet_id from the second part + tweet_id = parts[1] + + # Create the new key with the original tweet_id + new_key = tweet_id + result[new_key] = values + + return result + + + def saveApiJson(self, file = tootsFetchSettings['file']): + """ + Save the forest as sorted, API-ready JSON to a file. + + This method performs a complete transformation pipeline: + 1. Convert forest to API JSON format (asApiJson) + 2. Rename IDs to make them sortable (rename_ids) + 3. Sort in descending order (newest stories first) + 4. Revert IDs to original format (revert_ids) + 5. Write to JSON file + + The sorting ensures: + - Stories appear in reverse chronological order (newest first) + - Posts within each story maintain their sequential order + - The final output has clean toot IDs despite complex sorting logic + + Args: + file (str, optional): Path to the output JSON file. + Defaults to the configured file from tootsFetchSettings. 
+ + Side Effects: + Writes a JSON file containing the sorted toots with their metadata. + + Example: + >>> forest = TootForest.fromFolder() + >>> forest.saveApiJson() # Saves to default location + >>> forest.saveApiJson('/custom/path/toots.json') # Custom location + """ + data = self.asApiJson() + renamed_data = self.rename_ids(data) + # Sort toots based on id in descending order + sorted_toots = OrderedDict(sorted(renamed_data.items(), key=lambda x: x[0], reverse=True)) + final_toots = self.revert_ids(sorted_toots) + + writeTootsApiJson(final_toots, file) + + +# from .shared import tweetsFetchSettings +# from .mastodon_base import readTweetsFromFolder, writeTweetsApiJson +# from .Tweet import Tweet +# from collections import OrderedDict + +# class TweetForest(object): +# trunks = [] + +# @staticmethod +# def fromFolder(folder = tweetsFetchSettings['folder']): +# tweets = Tweet.loadFromFolder(folder) +# tweets = filter(lambda t: t.hasLinkTo(), tweets) +# # tweets = filter(lambda t: not t.hasHashtag('private'), tweets) +# return TweetForest(tweets) + +# def __init__(self, tweets): +# self.trunks = [] +# tweetsDict = {t.getId(): t for t in tweets} + +# for tweet in tweetsDict.values(): +# if tweet.isSelfReply(): +# if tweet.getReplyToId() in tweetsDict: +# tweetsDict[tweet.getReplyToId()].addChild(tweet) +# continue + +# self.trunks.append(tweet) + +# def __str__(self): +# def printLeafs(tweets, i = 0): +# a = [] +# for tweet in tweets: +# a.extend(["{}{}".format(' ' * i, l) for l in tweet.toStringList()]) +# a.append('') +# if tweet.hasChildren(): +# a.extend(printLeafs(tweet.getChildren(), i + 1)) +# return a +# return '\n'.join(printLeafs(self.trunks)) + +# def asApiJson(self): +# apiJson = {}; +# for tweet in self.trunks: +# if tweet.hasChildren(): +# story = tweet.getId() +# apiJson[story] = tweet.asApiDict({'story': story}) +# while tweet.hasChildren(): +# # ONLY FIRST CHILD +# tweet = tweet.getChildren()[0] +# apiJson[tweet.getId()] = tweet.asApiDict({'story': story}) +# else: +# apiJson[tweet.getId()] = tweet.asApiDict() + +# return apiJson + +# def rename_ids(self, data): +# # Create a dictionary to store the count of each story ID +# story_counts = {} + +# # Create a dictionary to store the length of each story +# story_lengths = {} + +# # Calculate the length of each story before the loop +# for tweet_id, values in data.items(): +# # Check if "story" key is present, otherwise use tweet_id as the story ID +# story_id = values.get("story", tweet_id) + +# # Update the length of the story +# story_lengths[story_id] = story_lengths.get(story_id, 0) + 1 + +# # Iterate through the original dictionary and create a new one with modified keys +# result = {} +# for tweet_id, values in data.items(): +# # Check if "story" key is present, otherwise use tweet_id as the story ID +# story_id = values.get("story", tweet_id) + +# # Calculate the count as the difference between the length of the story and the current length +# count = story_lengths[story_id] +# story_lengths[story_id] = story_lengths[story_id] - 1 + +# # Create the new key in the format "story_id.count_tweet_id" +# new_key = f"{story_id}.{count}_{tweet_id}" +# result[new_key] = values + +# return result + +# def revert_ids(self, data): +# # Iterate through the dictionary and create a new one with reverted keys +# result = {} +# for key, values in data.items(): +# # Split the key into story_id.count_tweet_id +# parts = key.split('_') + +# # Extract the tweet_id from the second part +# tweet_id = parts[1] + +# # Create the new key with 
the original tweet_id +# new_key = tweet_id +# result[new_key] = values + +# return result + +# def saveApiJson(self, file = tweetsFetchSettings['file']): +# data = self.asApiJson() +# renamed_data = self.rename_ids(data) +# # Sort tweets based on id in descending order +# sorted_tweets = OrderedDict(sorted(renamed_data.items(), key=lambda x: x[0], reverse=True)) +# final_tweets = self.revert_ids(sorted_tweets) + +# writeTweetsApiJson(final_tweets, file) \ No newline at end of file diff --git a/lib/Tweet.py b/lib/Tweet.py deleted file mode 100644 index 60e550c..0000000 --- a/lib/Tweet.py +++ /dev/null @@ -1,146 +0,0 @@ -from .shared import tweets_folder, tweetsFetchSettings -from .twitter_auth import get_api_app -from .tweets_base import * - -from datetime import datetime - -class Tweet(object): - data = None - - @staticmethod - def loadFromFile(id, folder = tweetsFetchSettings['folder']): - return Tweet(readTweetFromFolder(id, folder)) - - @staticmethod - def loadFromFolder(folder = tweetsFetchSettings['folder']): - return [Tweet(j) for j in readTweetsFromFolder(folder)] - - @staticmethod - def loadFromTwitter(id): - return Tweet(readTweetFromTwitter(id)) - - - def __init__(self, data): - self.data = data - self.children = [] - - def save(self, folder = tweetsFetchSettings['folder']): - writeTweetToFolder(self.data, folder) - - def delete(self, folder = tweetsFetchSettings['folder']): - deleteTweetFromFolder(self.getId(), folder) - - def addChild(self, tweet): - self.children.append(tweet) - - def getChildren(self): - return self.children - - def hasChildren(self): - return len(self.children) > 0 - - def asApiDict(self, add = {}): - return { - 'url': self.getPathOfLinksTo()[0], - 'hashtags': self.getHashtags(), - 'timestamp': str(self.getDateTime()), - 'content': self.getText(), - 'account': self.getUserName(), - 'display_name': self.getUserScreenName(), - 'avatar': self.getUserImageHttps(), - 'media': self.getMedia(), - **add - } - - def getData(self): - return self.data - - def toStringList(self): - return [ - "{}, {}, {}".format(self.getUserName(), self.getDateTime(), self.getId()), - "# {}".format(', '.join(self.getHashtags())), - "▶ {}".format(', '.join(self.getLinks())), - "¶ {}".format(self.getText()) - ] - - def __str__(self): - return '\n'.join(self.toStringList()) - - def getId(self): - return self.data['id_str'] - - def getUserId(self): - return self.data['user']['id_str'] - - def getUserName(self): - return self.data['user']['name'] - - def getUserScreenName(self): - return self.data['user']['screen_name'] - - def getUserImageHttps(self): - return self.data['user']['profile_image_url_https'] - - def getDateTime(self): - return datetime.strptime(self.data['created_at'], '%a %b %d %H:%M:%S +0000 %Y') - - def getMedia(self): - if 'extended_tweet' in self.data and 'extended_entities' in self.data['extended_tweet'] and 'media' in self.data['extended_tweet']['extended_entities']: - return([item['media_url_https'] for item in self.data['extended_tweet']['extended_entities']['media']]) - elif 'extended_entities' in self.data and 'media' in self.data['extended_entities']: - return([item['media_url_https'] for item in self.data['extended_entities']['media']]) - elif 'quoted_status' in self.data and 'extended_entities' in self.data['quoted_status'] and 'media' in self.data['quoted_status']['extended_entities']: - return([item['media_url_https'] for item in self.data['quoted_status']['extended_entities']['media']]) - elif 'quoted_status' in self.data and 'extended_tweet' in 
self.data['quoted_status'] and 'extended_entities' in self.data['quoted_status']['extended_tweet'] and 'media' in self.data['quoted_status']['extended_tweet']['extended_entities']: - return([item['media_url_https'] for item in self.data['quoted_status']['extended_tweet']['extended_entities']['media']]) - - def getText(self): - if 'full_text' in self.data: - return self.data['full_text'] - elif 'extended_tweet' in self.data: - if 'full_text' in self.data['extended_tweet']: - return self.data['extended_tweet']['full_text'] - elif 'retweeted_status' in self.data and 'extended_tweet' in self.data['retweeted_status'] and self.data['retweeted_status']['extended_tweet']['full_text']: - return self.data['retweeted_status']['extended_tweet']['full_text'] - elif 'text' in self.data: - return self.data['text'] - - def isReply(self): - return 'in_reply_to_status_id_str' in self.data - - def getReplyToId(self): - return self.data['in_reply_to_status_id_str'] - - def getReplyToUserId(self): - return self.data['in_reply_to_user_id_str'] - - def isSelfReply(self): - return self.isReply() and self.getUserId() == self.getReplyToUserId() - - def getHashtags(self): - hashtags = [] - if self.data['entities']['hashtags']: - hashtags.extend([h['text'] for h in self.data['entities']['hashtags']]) - if 'extended_tweet' in self.data and self.data['extended_tweet']['entities']['hashtags']: - hashtags.extend([h['text'] for h in self.data['extended_tweet']['entities']['hashtags']]) - return hashtags - - def hasHashtag(self, hashtag): - return hashtag in self.getHashtags() - - def getLinks(self): - urls = [] - if self.data['entities']['urls']: - urls.extend([url['expanded_url'] for url in self.data['entities']['urls']]) - if 'extended_tweet' in self.data and self.data['extended_tweet']['entities']['urls']: - urls.extend([url['expanded_url'] for url in self.data['extended_tweet']['entities']['urls']]) - return urls - - def getPathOfLinksTo(self, to = tweetsFetchSettings['link']): - return [u.split(to, 1)[1] for u in self.getLinks() if to in u] - - def hasLinkTo(self, to = f"{tweetsFetchSettings['link']}/@"): - return len(self.getPathOfLinksTo(to)) > 0 - - def getUrl(self): - return 'https://twitter.com/{}/status/{}'.format(self.getUserScreenName(), self.getId()) diff --git a/lib/TweetForest.py b/lib/TweetForest.py deleted file mode 100644 index 1b3e37a..0000000 --- a/lib/TweetForest.py +++ /dev/null @@ -1,108 +0,0 @@ -from .shared import tweetsFetchSettings -from .tweets_base import readTweetsFromFolder, writeTweetsApiJson -from .Tweet import Tweet -from collections import OrderedDict - -class TweetForest(object): - trunks = [] - - @staticmethod - def fromFolder(folder = tweetsFetchSettings['folder']): - tweets = Tweet.loadFromFolder(folder) - tweets = filter(lambda t: t.hasLinkTo(), tweets) - # tweets = filter(lambda t: not t.hasHashtag('private'), tweets) - return TweetForest(tweets) - - def __init__(self, tweets): - self.trunks = [] - tweetsDict = {t.getId(): t for t in tweets} - - for tweet in tweetsDict.values(): - if tweet.isSelfReply(): - if tweet.getReplyToId() in tweetsDict: - tweetsDict[tweet.getReplyToId()].addChild(tweet) - continue - - self.trunks.append(tweet) - - def __str__(self): - def printLeafs(tweets, i = 0): - a = [] - for tweet in tweets: - a.extend(["{}{}".format(' ' * i, l) for l in tweet.toStringList()]) - a.append('') - if tweet.hasChildren(): - a.extend(printLeafs(tweet.getChildren(), i + 1)) - return a - return '\n'.join(printLeafs(self.trunks)) - - def asApiJson(self): - apiJson = {}; - for 
tweet in self.trunks: - if tweet.hasChildren(): - story = tweet.getId() - apiJson[story] = tweet.asApiDict({'story': story}) - while tweet.hasChildren(): - # ONLY FIRST CHILD - tweet = tweet.getChildren()[0] - apiJson[tweet.getId()] = tweet.asApiDict({'story': story}) - else: - apiJson[tweet.getId()] = tweet.asApiDict() - - return apiJson - - def rename_ids(self, data): - # Create a dictionary to store the count of each story ID - story_counts = {} - - # Create a dictionary to store the length of each story - story_lengths = {} - - # Calculate the length of each story before the loop - for tweet_id, values in data.items(): - # Check if "story" key is present, otherwise use tweet_id as the story ID - story_id = values.get("story", tweet_id) - - # Update the length of the story - story_lengths[story_id] = story_lengths.get(story_id, 0) + 1 - - # Iterate through the original dictionary and create a new one with modified keys - result = {} - for tweet_id, values in data.items(): - # Check if "story" key is present, otherwise use tweet_id as the story ID - story_id = values.get("story", tweet_id) - - # Calculate the count as the difference between the length of the story and the current length - count = story_lengths[story_id] - story_lengths[story_id] = story_lengths[story_id] - 1 - - # Create the new key in the format "story_id.count_tweet_id" - new_key = f"{story_id}.{count}_{tweet_id}" - result[new_key] = values - - return result - - def revert_ids(self, data): - # Iterate through the dictionary and create a new one with reverted keys - result = {} - for key, values in data.items(): - # Split the key into story_id.count_tweet_id - parts = key.split('_') - - # Extract the tweet_id from the second part - tweet_id = parts[1] - - # Create the new key with the original tweet_id - new_key = tweet_id - result[new_key] = values - - return result - - def saveApiJson(self, file = tweetsFetchSettings['file']): - data = self.asApiJson() - renamed_data = self.rename_ids(data) - # Sort tweets based on id in descending order - sorted_tweets = OrderedDict(sorted(renamed_data.items(), key=lambda x: x[0], reverse=True)) - final_tweets = self.revert_ids(sorted_tweets) - - writeTweetsApiJson(final_tweets, file) diff --git a/lib/bluesky_base.py b/lib/bluesky_base.py new file mode 100644 index 0000000..46a2291 --- /dev/null +++ b/lib/bluesky_base.py @@ -0,0 +1,164 @@ +import json +import logging +import time +import threading +from urllib.parse import urlencode +import websocket +from datetime import datetime +from .bluesky_did_resolver import did_resolver + +class BlueSkyStreamListener: + def __init__(self, keywords, callback_func, reconnect_attempts=10, reconnect_delay=5): + self.endpoint = 'wss://jetstream2.us-west.bsky.network/subscribe' + self.wanted_collections = ['app.bsky.feed.post'] + self.keywords = keywords + self.callback_func = callback_func + self.ws = None + self.reconnect_attempts = 0 + self.max_reconnect_attempts = reconnect_attempts + self.reconnect_delay = reconnect_delay + self.should_run = True + + def _build_url(self): + params = [('wantedCollections', collection) for collection in self.wanted_collections] + query_string = urlencode(params) + return f"{self.endpoint}?{query_string}" + + def on_message(self, ws, message): + try: + data = json.loads(message) + self._process_message(data) + except json.JSONDecodeError as e: + logging.error(f"Error parsing Bluesky message: {e}") + except Exception as e: + logging.error(f"Error processing Bluesky message: {e}") + + def _process_message(self, 
message): + if message.get('kind') == 'commit' and message.get('commit'): + commit = message['commit'] + operation = commit.get('operation', '') + + # Handle deletion events + if operation == 'delete': + self._handle_deletion(message) + return + + # Handle creation/update events (existing logic) + if commit.get('record') and operation == 'create': + record = commit['record'] + + if record.get('text'): + text = record['text'].lower() + matched_keywords = [keyword for keyword in self.keywords if keyword.lower() in text] + + if matched_keywords: + # Extract ID from the message - use rkey which is the record key + post_id = commit.get('rkey', '') + + # Construct the proper AT-URI + at_uri = f"at://{message.get('did', '')}/app.bsky.feed.post/{post_id}" + + # Resolve DID to actual handle + did = message.get('did', '') + resolved_handle = did_resolver.resolve_did_to_handle(did) + + # Extract reply information if this is a reply + in_reply_to_id = None + in_reply_to_account_id = None + if 'reply' in record and record['reply'].get('parent'): + parent_uri = record['reply']['parent'].get('uri', '') + if parent_uri.startswith('at://'): + # Parse AT-URI to extract DID and post ID + # Format: at://did:plc:xxxxx/app.bsky.feed.post/postid + parts = parent_uri.split('/') + if len(parts) >= 4: + parent_did = parts[2] # The DID part + in_reply_to_id = parts[-1] # The post ID part + in_reply_to_account_id = parent_did + + # Convert to a format similar to Mastodon toot + bluesky_post = { + 'id': post_id, + 'content': record['text'], + 'created_at': datetime.fromtimestamp(message.get('time_us', 0) / 1000000).isoformat(), + 'account': { + 'acct': did, + 'username': resolved_handle.replace('@', '') if resolved_handle.startswith('@') else resolved_handle, + 'display_name': resolved_handle + }, + 'uri': at_uri, + 'url': f"https://bsky.app/profile/{message.get('did', '')}/post/{post_id}", + 'platform': 'bluesky', + 'matched_keywords': matched_keywords, + 'raw_record': record + } + + # Add reply fields if this is a reply + if in_reply_to_id: + bluesky_post['in_reply_to_id'] = in_reply_to_id + bluesky_post['in_reply_to_account_id'] = in_reply_to_account_id + + self.callback_func(bluesky_post) + + def _handle_deletion(self, message): + """Handle Bluesky post deletion events - only for posts we've captured""" + commit = message.get('commit', {}) + post_id = commit.get('rkey', '') + + if post_id: + # Only process deletions for posts we actually have stored + # The deletion callback will check if the post exists locally + if hasattr(self, 'deletion_callback') and self.deletion_callback: + self.deletion_callback(post_id) + + def set_deletion_callback(self, callback_func): + """Set a callback function to handle deletions""" + self.deletion_callback = callback_func + + def on_error(self, ws, error): + logging.error(f"Bluesky WebSocket error: {error}") + + def on_close(self, ws, close_status_code, close_msg): + logging.warning("Bluesky connection closed") + if self.should_run: + self._reconnect() + + def on_open(self, ws): + logging.info("Connected to Bluesky Jetstream") + self.reconnect_attempts = 0 + + def _reconnect(self): + if self.reconnect_attempts < self.max_reconnect_attempts and self.should_run: + self.reconnect_attempts += 1 + logging.info(f"Reconnecting to Bluesky in {self.reconnect_delay}s (attempt {self.reconnect_attempts}/{self.max_reconnect_attempts})") + time.sleep(self.reconnect_delay) + self.start_stream() + else: + logging.error("Max Bluesky reconnection attempts reached") + + def start_stream(self): + if 
not self.should_run: + return + + try: + url = self._build_url() + logging.info(f"Connecting to Bluesky: {url}") + + self.ws = websocket.WebSocketApp( + url, + on_open=self.on_open, + on_message=self.on_message, + on_error=self.on_error, + on_close=self.on_close + ) + + self.ws.run_forever() + except Exception as e: + logging.error(f"Error starting Bluesky stream: {e}") + if self.should_run: + self._reconnect() + + def stop_stream(self): + self.should_run = False + if self.ws: + self.ws.close() \ No newline at end of file diff --git a/lib/bluesky_did_resolver.py b/lib/bluesky_did_resolver.py new file mode 100644 index 0000000..2d69211 --- /dev/null +++ b/lib/bluesky_did_resolver.py @@ -0,0 +1,124 @@ +import requests +import logging +import json +import time +from datetime import datetime, timedelta + +class BlueSkyDIDResolver: + def __init__(self): + self.cache = {} + self.cache_expiry = {} + self.cache_duration = timedelta(hours=24) # Cache for 24 hours + + def resolve_did_to_handle(self, did): + """ + Resolve a DID to a Bluesky handle using AT Protocol + + Args: + did (str): The DID to resolve (e.g., 'did:plc:adsquh2z4vzbpeelyvkq4rbl') + + Returns: + str: The resolved handle (e.g., '@username.bsky.social') or formatted DID if resolution fails + """ + # Check cache first + if self._is_cached(did): + return self.cache[did] + + try: + # Use AT Protocol's com.atproto.identity.resolveHandle API + # But we need to resolve DID to handle, so we use the directory service + + # Try multiple resolution methods + handle = self._resolve_via_plc_directory(did) + if not handle: + handle = self._resolve_via_bsky_api(did) + + if handle: + # Cache the result + self._cache_result(did, handle) + return handle + else: + # Fallback: format the DID nicely + fallback = self._format_did_fallback(did) + self._cache_result(did, fallback) + return fallback + + except Exception as e: + logging.warning(f"Failed to resolve DID {did}: {e}") + fallback = self._format_did_fallback(did) + self._cache_result(did, fallback) + return fallback + + def _resolve_via_plc_directory(self, did): + """Try to resolve via PLC directory""" + try: + # PLC directory API + url = f"https://plc.directory/{did}" + response = requests.get(url, timeout=5) + + if response.status_code == 200: + data = response.json() + # Look for the handle in the service endpoints + if 'alsoKnownAs' in data: + for aka in data['alsoKnownAs']: + if aka.startswith('at://'): + handle = aka.replace('at://', '') + return f"@{handle}" + + except Exception as e: + logging.debug(f"PLC directory resolution failed for {did}: {e}") + + return None + + def _resolve_via_bsky_api(self, did): + """Try to resolve via Bluesky's public API""" + try: + # Use Bluesky's public API to get profile + url = f"https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile" + params = {'actor': did} + response = requests.get(url, params=params, timeout=5) + + if response.status_code == 200: + data = response.json() + if 'handle' in data: + return f"@{data['handle']}" + + except Exception as e: + logging.debug(f"Bluesky API resolution failed for {did}: {e}") + + return None + + def _format_did_fallback(self, did): + """Format DID as a more readable fallback""" + if did.startswith('did:plc:'): + # Take first 8 characters of the identifier + identifier = did.split(':')[-1][:8] + return f"@{identifier}...bsky" + return did + + def _is_cached(self, did): + """Check if DID is in cache and not expired""" + if did not in self.cache: + return False + + if did in self.cache_expiry: + if datetime.now() 
> self.cache_expiry[did]: + # Cache expired, remove it + del self.cache[did] + del self.cache_expiry[did] + return False + + return True + + def _cache_result(self, did, handle): + """Cache the resolution result""" + self.cache[did] = handle + self.cache_expiry[did] = datetime.now() + self.cache_duration + + def clear_cache(self): + """Clear the entire cache""" + self.cache.clear() + self.cache_expiry.clear() + +# Global instance for reuse +did_resolver = BlueSkyDIDResolver() \ No newline at end of file diff --git a/lib/mastodon_auth.py b/lib/mastodon_auth.py new file mode 100644 index 0000000..19266cf --- /dev/null +++ b/lib/mastodon_auth.py @@ -0,0 +1,15 @@ +from mastodon import Mastodon +from .shared import base_folder, mastodonAuth +import json +import os + +with open(os.path.join(base_folder, 'config.json'), 'r', encoding='utf-8') as f: + config = json.load(f) + +def get_auth_user(): + return Mastodon( + client_id = config['mastodon']['client']['key'], + client_secret = config['mastodon']['client']['secret'], + access_token = config['mastodon']['access']['token'], + api_base_url = config['mastodon']['instance_url'] + ) diff --git a/lib/mastodon_base.py b/lib/mastodon_base.py new file mode 100644 index 0000000..49b8578 --- /dev/null +++ b/lib/mastodon_base.py @@ -0,0 +1,55 @@ +import os +import logging +import json +from datetime import datetime + +from .shared import toots_folder, data_folder, tootsFetchSettings +from .mastodon_auth import get_auth_user + +class DateTimeEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, datetime): + return obj.strftime('%Y-%m-%d %H:%M:%S') + return super().default(obj) + + +def readTootFromFolder(id, folder = tootsFetchSettings['folder']): + ff = os.path.join(toots_folder, folder) + with open(os.path.join(ff, '{}.json'.format(id)), 'r', encoding='utf-8') as f: + return json.load(f) + +def writeTootToFolder(toot, folder = tootsFetchSettings['folder']): + ff = os.path.join(toots_folder, folder) + with open(os.path.join(ff, '{}.json'.format(toot['id'])), 'w', encoding='utf-8') as f: + json.dump(toot, f, cls=DateTimeEncoder) + +def writeTootToArchive(toot, folder = tootsFetchSettings['folder']): + ff = os.path.join(toots_folder, folder, "archive") + with open(os.path.join(ff, '{}.json'.format(toot['id'])), 'w', encoding='utf-8') as f: + json.dump(toot, f, cls=DateTimeEncoder) + +def deleteTootFromFolder(id, folder = tootsFetchSettings['folder']): + ff = os.path.join(toots_folder, folder) + os.remove(os.path.join(ff, '{}.json'.format(id))) + +def readTootsFromFolder(folder = tootsFetchSettings['folder']): + ff = os.path.join(toots_folder, folder) + toots = [readTootFromFolder(f[:-5], folder) for f in os.listdir(ff) if f[-5:] == '.json'] + logging.info('Read {} toots from folder: \'{}\''.format(len(toots), folder)) + return toots + + +def readTootFromMastodon(id): + mastodon = get_auth_user() + logging.info('Reading toot info from Mastodon API: {}'.format(id)) + return mastodon.status(id) + +def writeTootsApiJson(data, tootsApiFile = tootsFetchSettings['file']): + filePath = os.path.join(data_folder, tootsApiFile) + logging.info('Writing new toots API file: \'{}\''.format(tootsApiFile)) + with open(filePath, 'w', encoding='utf-8') as f: + json.dump(data, f) + +def readTootsApiJson(tootsApiFile = tootsFetchSettings['file']): + with open(os.path.join(data_folder, tootsApiFile), 'r', encoding='utf-8') as f: + return json.load(f) diff --git a/lib/shared.py b/lib/shared.py index d02db0c..35fb282 100644 --- a/lib/shared.py +++ 
b/lib/shared.py @@ -5,13 +5,13 @@ base_folder = os.path.join(lib_folder, '..') data_folder = os.path.join(base_folder, 'data') -tweets_folder = os.path.join(data_folder, 'tweets') +toots_folder = os.path.join(data_folder, 'toots') -with open(os.path.join(base_folder, 'config.json'), 'r') as f: +with open(os.path.join(base_folder, 'config.json'), 'r', encoding='utf-8') as f: config = json.load(f) -tweetsFetchSettings = config['tweetsFetchSettings']['list'][config['tweetsFetchSettings']['default']] +tootsFetchSettings = config['tootsFetchSettings']['list'][config['tootsFetchSettings']['default']] authToken = config['authToken'] secretKey = config['secretKey'] -twitterAuth = config['twitter'] +mastodonAuth = config['mastodon'] diff --git a/lib/tweets_aux.py b/lib/toots_aux.py similarity index 70% rename from lib/tweets_aux.py rename to lib/toots_aux.py index e128f13..dd4e27c 100644 --- a/lib/tweets_aux.py +++ b/lib/toots_aux.py @@ -1,12 +1,11 @@ -import json import os import logging import GetOldTweets3 as got -from .shared import tweets_folder, tweetsFetchSettings -from .tweets_base import * +from .shared import toots_folder, tootsFetchSettings +from .mastodon_base import * -def getOldTweetIds(searchString = tweetsFetchSettings['link'], max = 100): +def getOldTweetIds(searchString = tootsFetchSettings['link'], max = 100): tweetCriteria = got.manager.TweetCriteria().setQuerySearch(searchString)\ .setMaxTweets(max) logging.info('Loading old tweet ids from twitter (GoT):') @@ -21,8 +20,8 @@ def getOldTweetIds(searchString = tweetsFetchSettings['link'], max = 100): return [t.id for t in tweets] -def populateTweetsFolder(folder = tweetsFetchSettings['folder'], ids = [], init = False, refresh = False): - d = os.path.join(tweets_folder, folder) +def populateTweetsFolder(folder = tootsFetchSettings['folder'], ids = [], init = False, refresh = False): + d = os.path.join(toots_folder, folder) if not os.path.exists(d): os.mkdir(d) ids_e = [f[:-5] for f in os.listdir(d)] @@ -37,5 +36,5 @@ def populateTweetsFolder(folder = tweetsFetchSettings['folder'], ids = [], init ids_f = list(set(ids) - set(ids_e)) for id in ids_f: - info = readTweetFromTwitter(id) - writeTweetToFolder(info, folder) + info = readTootFromMastodon(id) + writeTootToFolder(info, folder) diff --git a/lib/tweets_base.py b/lib/tweets_base.py deleted file mode 100644 index da29c50..0000000 --- a/lib/tweets_base.py +++ /dev/null @@ -1,43 +0,0 @@ -import os -import logging -import json - -from .shared import tweets_folder, data_folder, tweetsFetchSettings -from .twitter_auth import get_api_app - -def readTweetFromFolder(id, folder = tweetsFetchSettings['folder']): - ff = os.path.join(tweets_folder, folder) - with open(os.path.join(ff, '{}.json'.format(id)), 'r') as f: - return json.load(f) - -def writeTweetToFolder(tweet, folder = tweetsFetchSettings['folder']): - ff = os.path.join(tweets_folder, folder) - with open(os.path.join(ff, '{}.json'.format(tweet['id_str'])), 'w') as f: - json.dump(tweet, f) - -def deleteTweetFromFolder(id, folder = tweetsFetchSettings['folder']): - ff = os.path.join(tweets_folder, folder) - os.remove(os.path.join(ff, '{}.json'.format(id))) - -def readTweetsFromFolder(folder = tweetsFetchSettings['folder']): - ff = os.path.join(tweets_folder, folder) - tweets = [readTweetFromFolder(f[:-5], folder) for f in os.listdir(ff) if f[-5:] == '.json'] - logging.info('Read {} tweets from folder: \'{}\''.format(len(tweets), folder)) - return tweets - - -def readTweetFromTwitter(id): - api = get_api_app() - logging.info('Reading 
tweet info from twitter API: {}'.format(id)) - return api.get_status(id, include_entities=True, tweet_mode='extended')._json - - -def writeTweetsApiJson(data, tweetsApiFile = tweetsFetchSettings['file']): - filePath = os.path.join(data_folder, tweetsApiFile) - logging.info('Writing new tweets API file: \'{}\''.format(tweetsApiFile)) - with open(filePath, 'w') as f: - json.dump(data, f) - -def readTweetsApiJson(tweetsApiFile = tweetsFetchSettings['file']): - with open(os.path.join(data_folder, tweetsApiFile), 'r') as f: - return json.load(f)
diff --git a/lib/twitter_auth.py b/lib/twitter_auth.py deleted file mode 100644 index c6f965b..0000000 --- a/lib/twitter_auth.py +++ /dev/null @@ -1,18 +0,0 @@ -from tweepy import AppAuthHandler, OAuthHandler, API -from .shared import base_folder, twitterAuth -import json -import os - - -def get_auth_app(): - ad = twitterAuth['api'] - return AppAuthHandler(ad['key'], ad['secret']) - -def get_api_app(): - return API(get_auth_app()) - -def get_auth_user(): - ad = twitterAuth - auth = OAuthHandler(ad['api']['key'], ad['api']['secret']) - auth.set_access_token(ad['access']['token'], ad['access']['token-secret']) - return auth
diff --git a/requirements.txt b/requirements.txt index 0054864..f33961f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,40 +1,63 @@ -backcall==0.1.0 -certifi==2020.4.5.1 -chardet==3.0.4 -click==7.1.2 -cssselect==1.1.0 -decorator==4.4.2 +# Core Flask and web framework Flask==1.1.2 Flask-Cors==3.0.8 +Werkzeug>=0.15 + +# Mastodon integration +Mastodon.py==1.8.1 +blurhash>=1.1.4 +python-magic>=0.4.27 + +# Twitter integration GetOldTweets3==0.0.11 -idna==2.9 -ipykernel==5.2.1 -ipython==7.14.0 -ipython-genutils==0.2.0 -itsdangerous==1.1.0 -jedi==0.17.0 -Jinja2==2.11.2 -jupyter-client==6.1.3 -jupyter-core==4.6.3 -lxml==4.5.1 -MarkupSafe==1.1.1 -oauthlib==3.1.0 -parso==0.7.0 -pexpect==4.8.0 -pickleshare==0.7.5 -prompt-toolkit==3.0.5 -ptyprocess==0.6.0 -Pygments==2.6.1 -pyquery==1.4.1 -PySocks==1.7.1 -python-dateutil==2.8.1 -pyzmq==19.0.1 -requests==2.23.0 -requests-oauthlib==1.3.0 -six==1.14.0 -tornado==6.0.4 -traitlets==4.3.3 tweepy==3.8.0 -urllib3==1.25.9 -wcwidth==0.1.9 -Werkzeug==1.0.1 + +# Bluesky integration +websocket-client==1.6.4 + +# HTTP and networking +requests>=2.23.0 +requests-oauthlib>=0.7.0 +oauthlib>=3.0.0 +urllib3>=1.25.9 +certifi>=2017.4.17 +PySocks>=1.5.7 + +# Web scraping and parsing +lxml>=4.5.1 +pyquery>=1.2.10 +cssselect>=1.1.0 + +# Utilities +python-dateutil>=2.8.1 +click>=7.1.2 +six>=1.14.0 +chardet>=3.0.2 +idna>=2.5 +decorator>=4.4.2 + +# Template engine +Jinja2>=2.10.1 +MarkupSafe>=1.1.1 +itsdangerous>=0.24 + +# Optional: Development and notebook support (install only if needed) +# ipython>=7.14.0 +# ipykernel>=5.2.1 +# jupyter-client>=6.1.3 +# jupyter-core>=4.6.3 +# ipython-genutils>=0.2.0 +# prompt-toolkit>=3.0.5 +# Pygments>=2.6.1 +# backcall>=0.1.0 +# pexpect>=4.8.0 +# pickleshare>=0.7.5 +# ptyprocess>=0.6.0 +# parso>=0.7.0 +# jedi>=0.17.0 +# wcwidth>=0.1.9 +# traitlets>=4.3.3 +# tornado>=6.0.4 + +# Note: pyzmq has compatibility issues with Python 3.13 and is not required for core functionality +# pyzmq==19.0.1 # Commented out due to Python 3.13 compatibility issues \ No newline at end of file
diff --git a/runInit.py b/runInit.py index c1ca65b..2f7af00 100755 --- a/runInit.py +++ b/runInit.py @@ -1,11 +1,10 @@ #!./venv/bin/python -import json import os import logging import GetOldTweets3 as got -from lib.shared import base_folder, tweetsFetchSettings -from lib.tweets_aux import getOldTweetIds, 
populateTweetsFolder +from lib.shared import base_folder, tootsFetchSettings +from lib.toots_aux import getOldTweetIds, populateTweetsFolder logging.basicConfig( level=logging.INFO,
diff --git a/runListener.py b/runListener.py index aa064fa..925f3f5 100755 --- a/runListener.py +++ b/runListener.py @@ -1,14 +1,52 @@ #!./venv/bin/python -import json -import tweepy + import os import logging +import time +import threading +import argparse +from mastodon import StreamListener + +from lib.shared import base_folder, tootsFetchSettings, toots_folder +try: + from lib.mastodon_auth import get_auth_user +except Exception as e: + logging.warning(f"Mastodon auth not configured: {e}") + get_auth_user = None +from lib.mastodon_base import writeTootToFolder, writeTootToArchive, readTootFromFolder, deleteTootFromFolder +from lib.TootForest import TootForest +from lib.bluesky_base import BlueSkyStreamListener + +# Parse command line arguments +parser = argparse.ArgumentParser(description='Social Media Listener') +parser.add_argument('--platforms', type=str, default='all', + help='Platforms to listen to: all, mastodon, bluesky, or comma-separated list (e.g., mastodon,bluesky)') +parser.add_argument('--mastodon-only', action='store_true', help='Listen only to Mastodon') +parser.add_argument('--bluesky-only', action='store_true', help='Listen only to Bluesky') +args = parser.parse_args() + +# Determine which platforms to enable +if args.mastodon_only: + enable_mastodon = True + enable_bluesky = False +elif args.bluesky_only: + enable_mastodon = False + enable_bluesky = True +elif args.platforms.lower() == 'all': + enable_mastodon = True + enable_bluesky = True +else: + platforms = [p.strip().lower() for p in args.platforms.split(',')] + enable_mastodon = 'mastodon' in platforms + enable_bluesky = 'bluesky' in platforms -from lib.shared import base_folder, tweetsFetchSettings -from lib.twitter_auth import get_auth_user -from lib.tweets_base import writeTweetToFolder -from lib.TweetForest import TweetForest +mastodon = get_auth_user() if get_auth_user and enable_mastodon else None +keyword = tootsFetchSettings['listen'] + +dir_path = os.path.join(toots_folder, tootsFetchSettings['folder']) +files = os.listdir(dir_path) +existing_ids = [os.path.splitext(file)[0] for file in files] logging.basicConfig( level=logging.INFO, @@ -19,36 +57,264 @@ ] ) -searchString = tweetsFetchSettings['listen'] +searchString = tootsFetchSettings['listen'] def update(): - forest = TweetForest.fromFolder(tweetsFetchSettings['folder']) - forest.saveApiJson(tweetsFetchSettings['file']) - -class DecarbnowStreamListener(tweepy.StreamListener): - def on_status(self, tweet): - id = tweet._json['id'] - logging.info('Listener got new tweet: {}'.format(id)) - writeTweetToFolder(tweet._json, tweetsFetchSettings['folder']) + global existing_ids + forest = TootForest.fromFolder(tootsFetchSettings['folder']) + forest.saveApiJson(tootsFetchSettings['file']) + + # Update the global existing_ids variable + files = os.listdir(dir_path) + existing_ids = [os.path.splitext(file)[0] for file in files] + +def handle_bluesky_post(bluesky_post): + """Handle incoming Bluesky posts, converting them to toot format and saving""" + global existing_ids + id = str(bluesky_post['id']) + if keyword in bluesky_post['content']: + logging.info('Listener got new Bluesky post: {}'.format(id)) + + # Check for story threading (same user replying to themselves with libmap link) + if (bluesky_post.get('in_reply_to_id') and + bluesky_post.get('in_reply_to_account_id') and + bluesky_post.get('account', 
{}).get('acct') == bluesky_post.get('in_reply_to_account_id')): + + # This is a self-reply, check if the parent has a libmap link and is in our collection + parent_id = str(bluesky_post['in_reply_to_id']) + if parent_id in existing_ids: + try: + parent_toot = readTootFromFolder(parent_id, tootsFetchSettings['folder']) + if keyword in parent_toot.get('content', ''): + # Parent has libmap link, this should be part of a story + # Find the root of the story by following the chain + root_id = parent_id + while True: + try: + root_toot = readTootFromFolder(root_id, tootsFetchSettings['folder']) + if (root_toot.get('in_reply_to_id') and + str(root_toot.get('in_reply_to_account_id')) == str(bluesky_post.get('account', {}).get('acct')) and + str(root_toot['in_reply_to_id']) in existing_ids): + root_id = str(root_toot['in_reply_to_id']) + else: + break + except Exception: + break + + # Add story field to the post before saving + bluesky_post['story'] = root_id + logging.info('Added Bluesky story field: {} -> {}'.format(id, root_id)) + except Exception as e: + logging.warning('Error checking Bluesky story threading: {}'.format(e)) + + writeTootToFolder(bluesky_post, tootsFetchSettings['folder']) + # Update existing_ids immediately to include the new post + existing_ids.append(id) update() - def on_error(self, status_code): - logging.warning('Listener got error, status code: {}'.format(status_code)) - return True +def handle_bluesky_deletion(post_id): + """Handle Bluesky post deletions - only archive posts we have captured""" + global existing_ids + str_status = str(post_id) + if str_status in existing_ids: + # This is a post we actually captured and stored + logging.info('Archiving Bluesky post (id: {})!'.format(str_status)) + toot = readTootFromFolder(str_status, tootsFetchSettings['folder']) + archive_dir = os.path.join(toots_folder, tootsFetchSettings['folder'], 'archive') + os.makedirs(archive_dir, exist_ok=True) + + writeTootToArchive(toot, tootsFetchSettings['folder']) + deleteTootFromFolder(post_id, tootsFetchSettings['folder']) + # Remove from existing_ids immediately + existing_ids.remove(str_status) + update() + # If post_id not in existing_ids, we silently ignore it (not relevant to us) + + +class MastodonStreamListener(StreamListener): + def __init__(self, mastodon): + self.mastodon = mastodon + self.receivedHeartbeat = False + + def stream_with_reconnection(self): + retry_delay = 10 + max_retries = 10 + retry_count = 0 + + while retry_count < max_retries: + try: + self.mastodon.stream_public(self) + retry_count = 0 + break + except Exception as e: + self.receivedHeartbeat = False + logging.warning(f"Error: {e}") + logging.info("Trying to reconnect after " + str(retry_delay) + " seconds.") + retry_count += 1 + time.sleep(retry_delay) + def on_update(self, toot): + global existing_ids + if keyword in toot['content']: + id = str(toot['id']) # Convert the ID to a string, if it's not already + logging.info('Listener got new toot: {}'.format(id)) + + # Check for story threading (same user replying to themselves with libmap link) + if (toot.get('in_reply_to_id') and + toot.get('in_reply_to_account_id') and + toot.get('account', {}).get('id') == toot.get('in_reply_to_account_id')): + + # This is a self-reply, check if the parent has a libmap link and is in our collection + parent_id = str(toot['in_reply_to_id']) + if parent_id in existing_ids: + try: + parent_toot = readTootFromFolder(parent_id, tootsFetchSettings['folder']) + if keyword in parent_toot.get('content', ''): + # Parent has libmap link, this 
should be part of a story + # Find the root of the story by following the chain + root_id = parent_id + while True: + try: + root_toot = readTootFromFolder(root_id, tootsFetchSettings['folder']) + if (root_toot.get('in_reply_to_id') and + str(root_toot.get('in_reply_to_account_id')) == str(toot.get('account', {}).get('id')) and + str(root_toot['in_reply_to_id']) in existing_ids): + root_id = str(root_toot['in_reply_to_id']) + else: + break + except Exception: + break + + # Add story field to the toot before saving + toot['story'] = root_id + logging.info('Added story field: {} -> {}'.format(id, root_id)) + except Exception as e: + logging.warning('Error checking story threading: {}'.format(e)) + + writeTootToFolder(toot, tootsFetchSettings['folder']) + # Update existing_ids immediately to include the new post + if id not in existing_ids: + existing_ids.append(id) + update() + + def on_status_update(self, toot): + global existing_ids + if keyword in toot['content']: + id = str(toot['id']) + logging.info('Listener got update of toot: {}'.format(id)) + + # Check for story threading (same user replying to themselves with libmap link) + if (toot.get('in_reply_to_id') and + toot.get('in_reply_to_account_id') and + toot.get('account', {}).get('id') == toot.get('in_reply_to_account_id')): + + # This is a self-reply, check if the parent has a libmap link and is in our collection + parent_id = str(toot['in_reply_to_id']) + if parent_id in existing_ids: + try: + parent_toot = readTootFromFolder(parent_id, tootsFetchSettings['folder']) + if keyword in parent_toot.get('content', ''): + # Parent has libmap link, this should be part of a story + # Find the root of the story by following the chain + root_id = parent_id + while True: + try: + root_toot = readTootFromFolder(root_id, tootsFetchSettings['folder']) + if (root_toot.get('in_reply_to_id') and + str(root_toot.get('in_reply_to_account_id')) == str(toot.get('account', {}).get('id')) and + str(root_toot['in_reply_to_id']) in existing_ids): + root_id = str(root_toot['in_reply_to_id']) + else: + break + except Exception: + break + + # Add story field to the toot before saving + toot['story'] = root_id + logging.info('Added story field to updated toot: {} -> {}'.format(id, root_id)) + except Exception as e: + logging.warning('Error checking story threading on update: {}'.format(e)) + + writeTootToFolder(toot, tootsFetchSettings['folder']) + # Update existing_ids immediately to include the new post + if id not in existing_ids: + existing_ids.append(id) + update() + + def on_delete(self, status_id): + global existing_ids + str_status = str(status_id) + if str_status in existing_ids: + logging.info('Archiving toot (id: {})!'.format(str_status)) + toot = readTootFromFolder(str_status, tootsFetchSettings['folder']) + archive_dir = os.path.join(toots_folder, tootsFetchSettings['folder'], 'archive') + os.makedirs(archive_dir, exist_ok=True) + + writeTootToArchive(toot, tootsFetchSettings['folder']) + deleteTootFromFolder(status_id, tootsFetchSettings['folder']) + # Remove from existing_ids immediately + existing_ids.remove(str_status) + update() + + def handle_heartbeat(self): + if not self.receivedHeartbeat: + logging.info("Connected to server. 
Listening.") + self.receivedHeartbeat = True + + def on_abort(self, status_code): + logging.warning('Listener got error, status code: {}'.format(status_code)) + self.stream_with_reconnection() logging.info('Init Listener:') -logging.info(' - Tweets Folder: \'{}\''.format(tweetsFetchSettings['folder'])) +logging.info(' - Toots Folder: \'{}\''.format(tootsFetchSettings['folder'])) logging.info(' - Search String: \'{}\''.format(searchString)) +logging.info(' - Enabled Platforms: Mastodon={}, Bluesky={}'.format(enable_mastodon, enable_bluesky)) + +listener = MastodonStreamListener(mastodon) if mastodon and enable_mastodon else None -listener = DecarbnowStreamListener() -stream = tweepy.Stream(auth = get_auth_user(), listener = listener, tweet_mode = 'extended') +# Initialize Bluesky listener if enabled +# Use the same keyword as Mastodon for consistency +keywords = [searchString] # Keep the full search string as one keyword +bluesky_listener = BlueSkyStreamListener(keywords, handle_bluesky_post) if enable_bluesky else None -logging.info('Init Tweets API File ...') +# Set up deletion callback for Bluesky +if bluesky_listener: + bluesky_listener.set_deletion_callback(handle_bluesky_deletion) + +logging.info('Init Toots API File ...') update() -logging.info('Start Listener ...') -try: - stream.filter(track=[searchString]) -except KeyboardInterrupt: - pass +logging.info('Start Listeners ...') + +# Start Bluesky listener in a separate thread if enabled +bluesky_thread = None +if bluesky_listener: + bluesky_thread = threading.Thread(target=bluesky_listener.start_stream, daemon=True) + bluesky_thread.start() + logging.info('Bluesky listener started in background thread') +else: + logging.info('Bluesky listener disabled') + +# Start Mastodon listener (blocks) if enabled +if listener: + logging.info('Starting Mastodon listener...') + try: + listener.stream_with_reconnection() + except KeyboardInterrupt: + logging.info('Shutting down listeners...') + if bluesky_listener: + bluesky_listener.stop_stream() + raise +else: + logging.info('Mastodon listener disabled') + if bluesky_listener: + logging.info('Running with Bluesky only - press Ctrl+C to stop') + try: + # Keep the main thread alive when only Bluesky is running + while True: + time.sleep(1) + except KeyboardInterrupt: + logging.info('Shutting down Bluesky listener...') + bluesky_listener.stop_stream() + else: + logging.error('No listeners enabled! Use --help for usage information.') \ No newline at end of file diff --git a/sync.sh b/sync.sh new file mode 100755 index 0000000..cf3bb0d --- /dev/null +++ b/sync.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +rsync -a --exclude='/data' . nebula@spica.uberspace.de:~/html/dev2.decarbnow.space