diff --git a/.gitignore b/.gitignore index 08c323a..658fb84 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,5 @@ temp/ # Test outputs out/ +outberduck/ +.splice/ diff --git a/README.md b/README.md index 5bd5231..61aff40 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Convert social/chat archives into normalized threads and export to Markdown, OAI - Idiomatic CLI (clig.dev principles) - Modular architecture: - - sources: Twitter/X today; Bluesky, ChatGPT, etc. next + - sources: Twitter/X archives and Bluesky repo CAR exports (text-first; blobs soon), ChatGPT, etc. next - transforms: filtering, grouping into threads/conversations, text cleaning - outputs: Markdown, OAI JSONL, JSONL (normalized items), ShareGPT - Library API to compose your own pipeline or plug in proprietary adapters @@ -147,6 +147,10 @@ We currently ingest: - Tweets (YTD `tweets`) and Likes (YTD `like`) - Media files prefixed with `-*` in `data/tweets_media/` +Bluesky/AT Protocol: +- Pass `--source path/to/repo-export.car`. We load `app.bsky.feed.post` records from the CAR. +- Media blobs are referenced (with blob CID + alt text) but not downloaded yet; they’ll show up when we add a blob fetch step. + ## Output layout On a successful run, you’ll see: diff --git a/package-lock.json b/package-lock.json index e9b1e8b..6766c0d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,6 +9,8 @@ "version": "0.1.1", "license": "MIT", "dependencies": { + "@atproto/api": "^0.18.0", + "@atproto/repo": "^0.8.10", "cosmiconfig": "^9.0.0" }, "bin": { @@ -25,6 +27,114 @@ "node": ">=18" } }, + "node_modules/@atproto/api": { + "version": "0.18.0", + "resolved": "https://registry.npmjs.org/@atproto/api/-/api-0.18.0.tgz", + "integrity": "sha512-2GxKPhhvMocDjRU7VpNj+cvCdmCHVAmRwyfNgRLMrJtPZvrosFoi9VATX+7eKN0FZvYvy8KdLSkCcpP2owH3IA==", + "license": "MIT", + "dependencies": { + "@atproto/common-web": "^0.4.3", + "@atproto/lexicon": "^0.5.1", + "@atproto/syntax": "^0.4.1", + "@atproto/xrpc": "^0.7.5", + "await-lock": "^2.2.2", + "multiformats": "^9.9.0", + "tlds": "^1.234.0", + "zod": "^3.23.8" + } + }, + "node_modules/@atproto/common": { + "version": "0.4.12", + "resolved": "https://registry.npmjs.org/@atproto/common/-/common-0.4.12.tgz", + "integrity": "sha512-NC+TULLQiqs6MvNymhQS5WDms3SlbIKGLf4n33tpftRJcalh507rI+snbcUb7TLIkKw7VO17qMqxEXtIdd5auQ==", + "license": "MIT", + "dependencies": { + "@atproto/common-web": "^0.4.3", + "@ipld/dag-cbor": "^7.0.3", + "cbor-x": "^1.5.1", + "iso-datestring-validator": "^2.2.2", + "multiformats": "^9.9.0", + "pino": "^8.21.0" + }, + "engines": { + "node": ">=18.7.0" + } + }, + "node_modules/@atproto/common-web": { + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/@atproto/common-web/-/common-web-0.4.3.tgz", + "integrity": "sha512-nRDINmSe4VycJzPo6fP/hEltBcULFxt9Kw7fQk6405FyAWZiTluYHlXOnU7GkQfeUK44OENG1qFTBcmCJ7e8pg==", + "license": "MIT", + "dependencies": { + "graphemer": "^1.4.0", + "multiformats": "^9.9.0", + "uint8arrays": "3.0.0", + "zod": "^3.23.8" + } + }, + "node_modules/@atproto/crypto": { + "version": "0.4.4", + "resolved": "https://registry.npmjs.org/@atproto/crypto/-/crypto-0.4.4.tgz", + "integrity": "sha512-Yq9+crJ7WQl7sxStVpHgie5Z51R05etaK9DLWYG/7bR5T4bhdcIgF6IfklLShtZwLYdVVj+K15s0BqW9a8PSDA==", + "license": "MIT", + "dependencies": { + "@noble/curves": "^1.7.0", + "@noble/hashes": "^1.6.1", + "uint8arrays": "3.0.0" + }, + "engines": { + "node": ">=18.7.0" + } + }, + "node_modules/@atproto/lexicon": { + "version": "0.5.1", + "resolved": "https://registry.npmjs.org/@atproto/lexicon/-/lexicon-0.5.1.tgz", + "integrity": "sha512-y8AEtYmfgVl4fqFxqXAeGvhesiGkxiy3CWoJIfsFDDdTlZUC8DFnZrYhcqkIop3OlCkkljvpSJi1hbeC1tbi8A==", + "license": "MIT", + "dependencies": { + "@atproto/common-web": "^0.4.3", + "@atproto/syntax": "^0.4.1", + "iso-datestring-validator": "^2.2.2", + "multiformats": "^9.9.0", + "zod": "^3.23.8" + } + }, + "node_modules/@atproto/repo": { + "version": "0.8.10", + "resolved": "https://registry.npmjs.org/@atproto/repo/-/repo-0.8.10.tgz", + "integrity": "sha512-REs6TZGyxNaYsjqLf447u+gSdyzhvMkVbxMBiKt1ouEVRkiho1CY32+omn62UkpCuGK2y6SCf6x3sVMctgmX4g==", + "license": "MIT", + "dependencies": { + "@atproto/common": "^0.4.12", + "@atproto/common-web": "^0.4.3", + "@atproto/crypto": "^0.4.4", + "@atproto/lexicon": "^0.5.1", + "@ipld/dag-cbor": "^7.0.0", + "multiformats": "^9.9.0", + "uint8arrays": "3.0.0", + "varint": "^6.0.0", + "zod": "^3.23.8" + }, + "engines": { + "node": ">=18.7.0" + } + }, + "node_modules/@atproto/syntax": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/@atproto/syntax/-/syntax-0.4.1.tgz", + "integrity": "sha512-CJdImtLAiFO+0z3BWTtxwk6aY5w4t8orHTMVJgkf++QRJWTxPbIFko/0hrkADB7n2EruDxDSeAgfUGehpH6ngw==", + "license": "MIT" + }, + "node_modules/@atproto/xrpc": { + "version": "0.7.5", + "resolved": "https://registry.npmjs.org/@atproto/xrpc/-/xrpc-0.7.5.tgz", + "integrity": "sha512-MUYNn5d2hv8yVegRL0ccHvTHAVj5JSnW07bkbiaz96UH45lvYNRVwt44z+yYVnb0/mvBzyD3/ZQ55TRGt7fHkA==", + "license": "MIT", + "dependencies": { + "@atproto/lexicon": "^0.5.1", + "zod": "^3.23.8" + } + }, "node_modules/@babel/code-frame": { "version": "7.27.1", "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.27.1.tgz", @@ -48,6 +158,84 @@ "node": ">=6.9.0" } }, + "node_modules/@cbor-extract/cbor-extract-darwin-arm64": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@cbor-extract/cbor-extract-darwin-arm64/-/cbor-extract-darwin-arm64-2.2.0.tgz", + "integrity": "sha512-P7swiOAdF7aSi0H+tHtHtr6zrpF3aAq/W9FXx5HektRvLTM2O89xCyXF3pk7pLc7QpaY7AoaE8UowVf9QBdh3w==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@cbor-extract/cbor-extract-darwin-x64": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@cbor-extract/cbor-extract-darwin-x64/-/cbor-extract-darwin-x64-2.2.0.tgz", + "integrity": "sha512-1liF6fgowph0JxBbYnAS7ZlqNYLf000Qnj4KjqPNW4GViKrEql2MgZnAsExhY9LSy8dnvA4C0qHEBgPrll0z0w==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@cbor-extract/cbor-extract-linux-arm": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@cbor-extract/cbor-extract-linux-arm/-/cbor-extract-linux-arm-2.2.0.tgz", + "integrity": "sha512-QeBcBXk964zOytiedMPQNZr7sg0TNavZeuUCD6ON4vEOU/25+pLhNN6EDIKJ9VLTKaZ7K7EaAriyYQ1NQ05s/Q==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@cbor-extract/cbor-extract-linux-arm64": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@cbor-extract/cbor-extract-linux-arm64/-/cbor-extract-linux-arm64-2.2.0.tgz", + "integrity": "sha512-rQvhNmDuhjTVXSPFLolmQ47/ydGOFXtbR7+wgkSY0bdOxCFept1hvg59uiLPT2fVDuJFuEy16EImo5tE2x3RsQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@cbor-extract/cbor-extract-linux-x64": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@cbor-extract/cbor-extract-linux-x64/-/cbor-extract-linux-x64-2.2.0.tgz", + "integrity": "sha512-cWLAWtT3kNLHSvP4RKDzSTX9o0wvQEEAj4SKvhWuOVZxiDAeQazr9A+PSiRILK1VYMLeDml89ohxCnUNQNQNCw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@cbor-extract/cbor-extract-win32-x64": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@cbor-extract/cbor-extract-win32-x64/-/cbor-extract-win32-x64-2.2.0.tgz", + "integrity": "sha512-l2M+Z8DO2vbvADOBNLbbh9y5ST1RY5sqkWOg/58GkUPBYou/cuNZ68SGQ644f1CvZ8kcOxyZtw06+dxWHIoN/w==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, "node_modules/@esbuild/aix-ppc64": { "version": "0.25.10", "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.25.10.tgz", @@ -490,6 +678,16 @@ "node": ">=18" } }, + "node_modules/@ipld/dag-cbor": { + "version": "7.0.3", + "resolved": "https://registry.npmjs.org/@ipld/dag-cbor/-/dag-cbor-7.0.3.tgz", + "integrity": "sha512-1VVh2huHsuohdXC1bGJNE8WR72slZ9XE2T3wbBBq31dm7ZBatmKLLxrB+XAqafxfRFjv08RZmj/W/ZqaM13AuA==", + "license": "(Apache-2.0 AND MIT)", + "dependencies": { + "cborg": "^1.6.0", + "multiformats": "^9.5.4" + } + }, "node_modules/@jridgewell/sourcemap-codec": { "version": "1.5.5", "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", @@ -497,6 +695,33 @@ "dev": true, "license": "MIT" }, + "node_modules/@noble/curves": { + "version": "1.9.7", + "resolved": "https://registry.npmjs.org/@noble/curves/-/curves-1.9.7.tgz", + "integrity": "sha512-gbKGcRUYIjA3/zCCNaWDciTMFI0dCkvou3TL8Zmy5Nc7sJ47a0jtOeZoTaMxkuqRo9cRhjOdZJXegxYE5FN/xw==", + "license": "MIT", + "dependencies": { + "@noble/hashes": "1.8.0" + }, + "engines": { + "node": "^14.21.3 || >=16" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + } + }, + "node_modules/@noble/hashes": { + "version": "1.8.0", + "resolved": "https://registry.npmjs.org/@noble/hashes/-/hashes-1.8.0.tgz", + "integrity": "sha512-jCs9ldd7NwzpgXDIf6P3+NrHh9/sD6CQdxHyjQI+h/6rDNo88ypBxxz45UDuZHz9r3tNz7N/VInSVoVdtXEI4A==", + "license": "MIT", + "engines": { + "node": "^14.21.3 || >=16" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + } + }, "node_modules/@rollup/rollup-android-arm-eabi": { "version": "4.52.4", "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.52.4.tgz", @@ -955,6 +1180,18 @@ "url": "https://opencollective.com/vitest" } }, + "node_modules/abort-controller": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "license": "MIT", + "dependencies": { + "event-target-shim": "^5.0.0" + }, + "engines": { + "node": ">=6.5" + } + }, "node_modules/argparse": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", @@ -971,6 +1208,65 @@ "node": ">=12" } }, + "node_modules/atomic-sleep": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/atomic-sleep/-/atomic-sleep-1.0.0.tgz", + "integrity": "sha512-kNOjDqAh7px0XWNI+4QbzoiR/nTkHAWNud2uvnJquD1/x5a7EQZMJT0AczqK0Qn67oY/TTQ1LbUKajZpp3I9tQ==", + "license": "MIT", + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/await-lock": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/await-lock/-/await-lock-2.2.2.tgz", + "integrity": "sha512-aDczADvlvTGajTDjcjpJMqRkOF6Qdz3YbPZm/PyW6tKPkx2hlYBzxMhEywM/tU72HrVZjgl5VCdRuMlA7pZ8Gw==", + "license": "MIT" + }, + "node_modules/base64-js": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", + "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/buffer": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/buffer/-/buffer-6.0.3.tgz", + "integrity": "sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "dependencies": { + "base64-js": "^1.3.1", + "ieee754": "^1.2.1" + } + }, "node_modules/cac": { "version": "6.7.14", "resolved": "https://registry.npmjs.org/cac/-/cac-6.7.14.tgz", @@ -990,6 +1286,46 @@ "node": ">=6" } }, + "node_modules/cbor-extract": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/cbor-extract/-/cbor-extract-2.2.0.tgz", + "integrity": "sha512-Ig1zM66BjLfTXpNgKpvBePq271BPOvu8MR0Jl080yG7Jsl+wAZunfrwiwA+9ruzm/WEdIV5QF/bjDZTqyAIVHA==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "dependencies": { + "node-gyp-build-optional-packages": "5.1.1" + }, + "bin": { + "download-cbor-prebuilds": "bin/download-prebuilds.js" + }, + "optionalDependencies": { + "@cbor-extract/cbor-extract-darwin-arm64": "2.2.0", + "@cbor-extract/cbor-extract-darwin-x64": "2.2.0", + "@cbor-extract/cbor-extract-linux-arm": "2.2.0", + "@cbor-extract/cbor-extract-linux-arm64": "2.2.0", + "@cbor-extract/cbor-extract-linux-x64": "2.2.0", + "@cbor-extract/cbor-extract-win32-x64": "2.2.0" + } + }, + "node_modules/cbor-x": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/cbor-x/-/cbor-x-1.6.0.tgz", + "integrity": "sha512-0kareyRwHSkL6ws5VXHEf8uY1liitysCVJjlmhaLG+IXLqhSaOO+t63coaso7yjwEzWZzLy8fJo06gZDVQM9Qg==", + "license": "MIT", + "optionalDependencies": { + "cbor-extract": "^2.2.0" + } + }, + "node_modules/cborg": { + "version": "1.10.2", + "resolved": "https://registry.npmjs.org/cborg/-/cborg-1.10.2.tgz", + "integrity": "sha512-b3tFPA9pUr2zCUiCfRd2+wok2/LBSNUMKOuRRok+WlvvAgEt/PlbgPTsZUcwCOs53IJvLgTp0eotwtosE6njug==", + "license": "Apache-2.0", + "bin": { + "cborg": "cli.js" + } + }, "node_modules/chai": { "version": "5.3.3", "resolved": "https://registry.npmjs.org/chai/-/chai-5.3.3.tgz", @@ -1086,6 +1422,16 @@ "node": ">=6" } }, + "node_modules/detect-libc": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz", + "integrity": "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==", + "license": "Apache-2.0", + "optional": true, + "engines": { + "node": ">=8" + } + }, "node_modules/env-paths": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/env-paths/-/env-paths-2.2.1.tgz", @@ -1163,6 +1509,24 @@ "@types/estree": "^1.0.0" } }, + "node_modules/event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/events": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/events/-/events-3.3.0.tgz", + "integrity": "sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==", + "license": "MIT", + "engines": { + "node": ">=0.8.x" + } + }, "node_modules/execa": { "version": "9.6.0", "resolved": "https://registry.npmjs.org/execa/-/execa-9.6.0.tgz", @@ -1200,6 +1564,15 @@ "node": ">=12.0.0" } }, + "node_modules/fast-redact": { + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/fast-redact/-/fast-redact-3.5.0.tgz", + "integrity": "sha512-dwsoQlS7h9hMeYUq1W++23NDcBLV4KqONnITDV9DjfS3q1SgDGVrBdvvTLUotWtPSD7asWDV9/CmsZPy8Hf70A==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/figures": { "version": "6.1.0", "resolved": "https://registry.npmjs.org/figures/-/figures-6.1.0.tgz", @@ -1261,6 +1634,12 @@ "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1" } }, + "node_modules/graphemer": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/graphemer/-/graphemer-1.4.0.tgz", + "integrity": "sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==", + "license": "MIT" + }, "node_modules/human-signals": { "version": "8.0.1", "resolved": "https://registry.npmjs.org/human-signals/-/human-signals-8.0.1.tgz", @@ -1271,6 +1650,26 @@ "node": ">=18.18.0" } }, + "node_modules/ieee754": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", + "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "BSD-3-Clause" + }, "node_modules/import-fresh": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", @@ -1339,6 +1738,12 @@ "dev": true, "license": "ISC" }, + "node_modules/iso-datestring-validator": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/iso-datestring-validator/-/iso-datestring-validator-2.2.2.tgz", + "integrity": "sha512-yLEMkBbLZTlVQqOnQ4FiMujR6T4DEcCb1xizmvXS+OxuhwcbtynoosRzdMA69zZCShCNAbi+gJ71FxZBBXx1SA==", + "license": "MIT" + }, "node_modules/js-tokens": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", @@ -1393,6 +1798,12 @@ "dev": true, "license": "MIT" }, + "node_modules/multiformats": { + "version": "9.9.0", + "resolved": "https://registry.npmjs.org/multiformats/-/multiformats-9.9.0.tgz", + "integrity": "sha512-HoMUjhH9T8DDBNT+6xzkrd9ga/XiBI4xLr58LJACwK6G3HTOPeMz4nB4KJs33L2BelrIJa7P0VuNaVF3hMYfjg==", + "license": "(Apache-2.0 AND MIT)" + }, "node_modules/nanoid": { "version": "3.3.11", "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", @@ -1412,6 +1823,21 @@ "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" } }, + "node_modules/node-gyp-build-optional-packages": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/node-gyp-build-optional-packages/-/node-gyp-build-optional-packages-5.1.1.tgz", + "integrity": "sha512-+P72GAjVAbTxjjwUmwjVrqrdZROD4nf8KgpBoDxqXXTiYZZt/ud60dE5yvCSr9lRO8e8yv6kgJIC0K0PfZFVQw==", + "license": "MIT", + "optional": true, + "dependencies": { + "detect-libc": "^2.0.1" + }, + "bin": { + "node-gyp-build-optional-packages": "bin.js", + "node-gyp-build-optional-packages-optional": "optional.js", + "node-gyp-build-optional-packages-test": "build-test.js" + } + }, "node_modules/npm-run-path": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-6.0.0.tgz", @@ -1442,6 +1868,15 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/on-exit-leak-free": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/on-exit-leak-free/-/on-exit-leak-free-2.1.2.tgz", + "integrity": "sha512-0eJJY6hXLGf1udHwfNftBqH+g73EU4B504nZeKpz1sYRKafAghwxEJunB2O7rDZkL4PGfsMVnTXZ2EjibbqcsA==", + "license": "MIT", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/parent-module": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", @@ -1518,6 +1953,44 @@ "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", "license": "ISC" }, + "node_modules/pino": { + "version": "8.21.0", + "resolved": "https://registry.npmjs.org/pino/-/pino-8.21.0.tgz", + "integrity": "sha512-ip4qdzjkAyDDZklUaZkcRFb2iA118H9SgRh8yzTkSQK8HilsOJF7rSY8HoW5+I0M46AZgX/pxbprf2vvzQCE0Q==", + "license": "MIT", + "dependencies": { + "atomic-sleep": "^1.0.0", + "fast-redact": "^3.1.1", + "on-exit-leak-free": "^2.1.0", + "pino-abstract-transport": "^1.2.0", + "pino-std-serializers": "^6.0.0", + "process-warning": "^3.0.0", + "quick-format-unescaped": "^4.0.3", + "real-require": "^0.2.0", + "safe-stable-stringify": "^2.3.1", + "sonic-boom": "^3.7.0", + "thread-stream": "^2.6.0" + }, + "bin": { + "pino": "bin.js" + } + }, + "node_modules/pino-abstract-transport": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/pino-abstract-transport/-/pino-abstract-transport-1.2.0.tgz", + "integrity": "sha512-Guhh8EZfPCfH+PMXAb6rKOjGQEoy0xlAIn+irODG5kgfYV+BQ0rGYYWTIel3P5mmyXqkYkPmdIkywsn6QKUR1Q==", + "license": "MIT", + "dependencies": { + "readable-stream": "^4.0.0", + "split2": "^4.0.0" + } + }, + "node_modules/pino-std-serializers": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/pino-std-serializers/-/pino-std-serializers-6.2.2.tgz", + "integrity": "sha512-cHjPPsE+vhj/tnhCy/wiMh3M3z3h/j15zHQX+S9GkTBgqJuTuJzYJ4gUyACLhDaJ7kk9ba9iRDmbH2tJU03OiA==", + "license": "MIT" + }, "node_modules/postcss": { "version": "8.5.6", "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz", @@ -1563,6 +2036,52 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/process": { + "version": "0.11.10", + "resolved": "https://registry.npmjs.org/process/-/process-0.11.10.tgz", + "integrity": "sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A==", + "license": "MIT", + "engines": { + "node": ">= 0.6.0" + } + }, + "node_modules/process-warning": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/process-warning/-/process-warning-3.0.0.tgz", + "integrity": "sha512-mqn0kFRl0EoqhnL0GQ0veqFHyIN1yig9RHh/InzORTUiZHFRAur+aMtRkELNwGs9aNwKS6tg/An4NYBPGwvtzQ==", + "license": "MIT" + }, + "node_modules/quick-format-unescaped": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/quick-format-unescaped/-/quick-format-unescaped-4.0.4.tgz", + "integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==", + "license": "MIT" + }, + "node_modules/readable-stream": { + "version": "4.7.0", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-4.7.0.tgz", + "integrity": "sha512-oIGGmcpTLwPga8Bn6/Z75SVaH1z5dUut2ibSyAMVhmUggWpmDn2dapB0n7f8nwaSiRtepAsfJyfXIO5DCVAODg==", + "license": "MIT", + "dependencies": { + "abort-controller": "^3.0.0", + "buffer": "^6.0.3", + "events": "^3.3.0", + "process": "^0.11.10", + "string_decoder": "^1.3.0" + }, + "engines": { + "node": "^12.22.0 || ^14.17.0 || >=16.0.0" + } + }, + "node_modules/real-require": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/real-require/-/real-require-0.2.0.tgz", + "integrity": "sha512-57frrGM/OCTLqLOAh0mhVA9VBMHd+9U7Zb2THMGdBUoZVOtGbJzjxsYGDJ3A9AYYCP4hn6y1TVbaOfzWtm5GFg==", + "license": "MIT", + "engines": { + "node": ">= 12.13.0" + } + }, "node_modules/resolve-from": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", @@ -1624,6 +2143,35 @@ "fsevents": "~2.3.2" } }, + "node_modules/safe-buffer": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", + "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/safe-stable-stringify": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/safe-stable-stringify/-/safe-stable-stringify-2.5.0.tgz", + "integrity": "sha512-b3rppTKm9T+PsVCBEOUR46GWI7fdOs00VKZ1+9c1EWDaDMvjQc6tUwuFyIprgGgTcWoVHSKrU8H31ZHA2e0RHA==", + "license": "MIT", + "engines": { + "node": ">=10" + } + }, "node_modules/shebang-command": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", @@ -1667,6 +2215,15 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/sonic-boom": { + "version": "3.8.1", + "resolved": "https://registry.npmjs.org/sonic-boom/-/sonic-boom-3.8.1.tgz", + "integrity": "sha512-y4Z8LCDBuum+PBP3lSV7RHrXscqksve/bi0as7mhwVnBW+/wUqKT/2Kb7um8yqcFy0duYbbPxzt89Zy2nOCaxg==", + "license": "MIT", + "dependencies": { + "atomic-sleep": "^1.0.0" + } + }, "node_modules/source-map-js": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", @@ -1677,6 +2234,15 @@ "node": ">=0.10.0" } }, + "node_modules/split2": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/split2/-/split2-4.2.0.tgz", + "integrity": "sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg==", + "license": "ISC", + "engines": { + "node": ">= 10.x" + } + }, "node_modules/stackback": { "version": "0.0.2", "resolved": "https://registry.npmjs.org/stackback/-/stackback-0.0.2.tgz", @@ -1691,6 +2257,15 @@ "dev": true, "license": "MIT" }, + "node_modules/string_decoder": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", + "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", + "license": "MIT", + "dependencies": { + "safe-buffer": "~5.2.0" + } + }, "node_modules/strip-final-newline": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/strip-final-newline/-/strip-final-newline-4.0.0.tgz", @@ -1704,6 +2279,15 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/thread-stream": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/thread-stream/-/thread-stream-2.7.0.tgz", + "integrity": "sha512-qQiRWsU/wvNolI6tbbCKd9iKaTnCXsTwVxhhKM6nctPdujTyztjlbUkUTUymidWcMnZ5pWR0ej4a0tjsW021vw==", + "license": "MIT", + "dependencies": { + "real-require": "^0.2.0" + } + }, "node_modules/tinybench": { "version": "2.9.0", "resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz", @@ -1748,6 +2332,15 @@ "node": ">=14.0.0" } }, + "node_modules/tlds": { + "version": "1.261.0", + "resolved": "https://registry.npmjs.org/tlds/-/tlds-1.261.0.tgz", + "integrity": "sha512-QXqwfEl9ddlGBaRFXIvNKK6OhipSiLXuRuLJX5DErz0o0Q0rYxulWLdFryTkV5PkdZct5iMInwYEGe/eR++1AA==", + "license": "MIT", + "bin": { + "tlds": "bin.js" + } + }, "node_modules/tsx": { "version": "4.20.6", "resolved": "https://registry.npmjs.org/tsx/-/tsx-4.20.6.tgz", @@ -1782,6 +2375,15 @@ "node": ">=14.17" } }, + "node_modules/uint8arrays": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/uint8arrays/-/uint8arrays-3.0.0.tgz", + "integrity": "sha512-HRCx0q6O9Bfbp+HHSfQQKD7wU70+lydKVt4EghkdOvlK/NlrF90z+eXV34mUd48rNvVJXwkrMSPpCATkct8fJA==", + "license": "MIT", + "dependencies": { + "multiformats": "^9.4.2" + } + }, "node_modules/undici-types": { "version": "6.21.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", @@ -1802,6 +2404,12 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/varint": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/varint/-/varint-6.0.0.tgz", + "integrity": "sha512-cXEIW6cfr15lFv563k4GuVuW/fiwjknytD37jIOLSdSWuOI6WnO/oKwmP2FQTU2l01LP8/M5TSAJpzUaGe3uWg==", + "license": "MIT" + }, "node_modules/vite": { "version": "5.4.20", "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.20.tgz", @@ -2426,6 +3034,15 @@ "funding": { "url": "https://github.com/sponsors/sindresorhus" } + }, + "node_modules/zod": { + "version": "3.25.76", + "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz", + "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/colinhacks" + } } } } diff --git a/package.json b/package.json index 06fb080..91f1e51 100644 --- a/package.json +++ b/package.json @@ -54,6 +54,8 @@ "node": ">=18" }, "dependencies": { + "@atproto/api": "^0.18.0", + "@atproto/repo": "^0.8.10", "cosmiconfig": "^9.0.0" }, "devDependencies": { diff --git a/src/cli/splice.ts b/src/cli/splice.ts index 0c5bb4c..d4565f0 100644 --- a/src/cli/splice.ts +++ b/src/cli/splice.ts @@ -13,6 +13,7 @@ import { fileURLToPath } from "node:url"; import { CLIOptions, parseArgs, makeLogger, usage } from "../core/types"; import { detectTwitterArchive, ingestTwitter } from "../sources/twitter"; +import { detectBlueskyCar, ingestBlueskyCar, enrichBlueskyPosts } from "../sources/bluesky"; import { applyFilters, indexById, @@ -118,6 +119,7 @@ async function main() { "--status", "--ids", "--ids-file", + "--enrich", "--", ]); const unknown = argv.filter( @@ -164,18 +166,47 @@ async function main() { opts.workspace || path.join(outDir, ".splice"), ); - const detected = await detectTwitterArchive(source); - if (!detected) { + const adapters = [ + { + kind: "twitter", + detect: detectTwitterArchive, + ingest: ingestTwitter, + }, + { + kind: "bluesky", + detect: detectBlueskyCar, + ingest: ingestBlueskyCar, + }, + ] as const; + + let selected: + | (typeof adapters)[number] + | null = null; + for (const adapter of adapters) { + // eslint-disable-next-line no-await-in-loop + const matches = await adapter.detect(source); + if (matches) { + selected = adapter; + break; + } + } + if (!selected) { logger( "error", - `Could not detect a Twitter archive at ${source} (missing data/manifest.js)`, + `Could not detect a supported archive at ${source} (expected Twitter directory or Bluesky .car file)`, ); process.exit(2); } + logger("info", `Detected source: ${selected.kind}`); try { - logger("info", `Ingesting from ${source}`); - const items = await ingestTwitter(source, logger); + logger("info", `Ingesting ${selected.kind} data from ${source}`); + let items = await selected.ingest(source, logger); + + // Enrich Bluesky posts with parent context if requested + if (opts.enrich && selected.kind === "bluesky") { + items = await enrichBlueskyPosts(items, logger); + } const filtered = applyFilters(items, { since: opts.since, @@ -337,7 +368,15 @@ async function main() { const manifest = createCheckpointManifest({ parentId: (latest && latest.id) || null, itemsRef: itemsRefAll, - sourceRefs: [{ kind: "twitter", uri: source }], + sourceRefs: [ + { + kind: selected.kind, + uri: + selected.kind === "bluesky" + ? items[0]?.accountId ?? source + : source, + }, + ], transforms, decisionsRef, materialized: { threadsRef, conversationsRef }, diff --git a/src/core/types.ts b/src/core/types.ts index 90aad3f..c271193 100644 --- a/src/core/types.ts +++ b/src/core/types.ts @@ -84,13 +84,15 @@ export type CLIOptions = { idsFile?: string; // outputs statsJson: boolean; + // bluesky enrichment + enrich: boolean; }; export const DEFAULT_SYSTEM_MESSAGE = "You have been uploaded to the internet"; export function parseArgs(argv: string[]): CLIOptions { const opts: CLIOptions = { - format: ["markdown", "oai"], + format: ["markdown", "oai", "json"], systemMessage: DEFAULT_SYSTEM_MESSAGE, dryRun: false, logLevel: "info", @@ -111,6 +113,8 @@ export function parseArgs(argv: string[]): CLIOptions { statsJson: false, workspace: undefined, checkpoint: undefined, + // bluesky enrichment + enrich: false, }; const args = argv.slice(2); @@ -176,6 +180,8 @@ export function parseArgs(argv: string[]): CLIOptions { opts.withMedia = true; } else if (a === "--stats-json") { opts.statsJson = true; + } else if (a === "--enrich") { + opts.enrich = true; } else if (a === "--decisions-import" || a === "--decisions-file") { opts.decisionsImport = args[++i]; } else if (a === "--set-status" || a === "--status") { diff --git a/src/index.ts b/src/index.ts index 132eed5..bc255f8 100644 --- a/src/index.ts +++ b/src/index.ts @@ -18,6 +18,7 @@ export * from "./core/types"; // Re-export built-in Source(s) export * from "./sources/twitter"; +export * from "./sources/bluesky"; // Re-export built-in Transforms export * from "./transforms/core"; diff --git a/src/outputs/writers.ts b/src/outputs/writers.ts index 1b41376..d88e8f2 100644 --- a/src/outputs/writers.ts +++ b/src/outputs/writers.ts @@ -1,9 +1,11 @@ import * as fs from "node:fs/promises"; import * as path from "node:path"; +import { AtUri } from "@atproto/api"; import { ContentItem, Thread, Level, + MediaAttachment, formatIsoDateOnly, sanitizeFilename, isRetweet, @@ -24,7 +26,7 @@ async function ensureDir(p: string) { async function copyMedia( items: ContentItem[], imagesDir: string, - logger: (l: Level, m: string) => void, + logger: (l: Level, m: string) => void ) { await ensureDir(imagesDir); for (const it of items) { @@ -32,20 +34,62 @@ async function copyMedia( const base = m.absPath ? path.basename(m.absPath) : `${m.id}.bin`; try { if (!m.absPath) { - logger("warn", `No absPath for media ${m.id}; skipping copy`); + logger("debug", `No absPath for media ${m.id}; skipping copy`); continue; } await fs.copyFile(m.absPath, path.join(imagesDir, `_${base}`)); } catch (e) { logger( "warn", - `Failed to copy media ${m.absPath ?? m.id}: ${(e as Error).message}`, + `Failed to copy media ${m.absPath ?? m.id}: ${(e as Error).message}` ); } } } } +const SELF_POST_SOURCES = new Set(["twitter:tweet", "bluesky:post"]); + +function isSelfAuthoredPost(item: ContentItem): boolean { + return SELF_POST_SOURCES.has(item.source); +} + +function isReshare(item: ContentItem): boolean { + return item.source === "twitter:tweet" && isRetweet(item.text); +} + +function mediaMarkdownLinks(media?: MediaAttachment[]): string[] { + if (!media) return []; + return media + .filter((m) => !!m.absPath) + .map((m) => { + const base = path.basename(m.absPath as string); + return `![${base}](../../images/_${base})`; + }); +} + +function buildPermalink(item: ContentItem): { url: string; label: string } | null { + if (item.source === "twitter:tweet") { + return { + url: `https://twitter.com/i/web/status/${item.id}`, + label: "Twitter", + }; + } + if (item.source === "bluesky:post") { + try { + const uri = new AtUri(item.id); + if (!uri.host || !uri.rkey) return null; + return { + url: `https://bsky.app/profile/${uri.host}/post/${uri.rkey}`, + label: "Bluesky", + }; + } catch { + return null; + } + } + return null; +} + /** * Ensure quoted tweet links render as separate paragraphs. * Surround twitter.com or x.com status URLs with blank lines, without stripping intentional spacing. @@ -83,8 +127,8 @@ function isolateQuotedTweetLinks(text: string): string { /** * Write Markdown outputs: - * - threads/<yyyymmdd>-thread-<slug>.md with frontmatter, cleaned text, media links, and link to Twitter - * - tweets/<yyyymmdd>-tweet-<slug>.md for non-thread tweets (excluding RTs) + * - threads/<yyyymmdd>-thread-<slug>.md with frontmatter, cleaned text, media links, and link to the source platform + * - tweets/<yyyymmdd>-tweet-<slug>.md for non-thread posts (excluding reshares) * - images/_<file> copied for referenced items */ export async function writeMarkdown( @@ -92,7 +136,7 @@ export async function writeMarkdown( items: ContentItem[], outDir: string, logger: (l: Level, m: string) => void, - dryRun: boolean, + dryRun: boolean ) { const threadsDir = path.join(outDir, "threads"); const tweetsDir = path.join(outDir, "tweets"); @@ -108,13 +152,10 @@ export async function writeMarkdown( const realThreads = threads.filter((t) => t.items.length > 1); const threadItems = realThreads.flatMap((t) => t.items); const threadIds = new Set(threadItems.map((i) => i.id)); - const nonThreadTweets = items.filter( - (i) => - i.source === "twitter:tweet" && - !threadIds.has(i.id) && - !isRetweet(i.text), + const nonThreadPosts = items.filter( + (i) => isSelfAuthoredPost(i) && !threadIds.has(i.id) && !isReshare(i) ); - const copyPool = threadItems.concat(nonThreadTweets); + const copyPool = threadItems.concat(nonThreadPosts); logger("info", `Preparing media for ${copyPool.length} items`); if (!dryRun) await copyMedia(copyPool, imagesDir, logger); @@ -128,21 +169,25 @@ export async function writeMarkdown( const parts: string[] = []; for (const t of thread.items) { - const mediaLinks = (t.media ?? []).map((m) => { - const base = m.absPath ? path.basename(m.absPath) : `${m.id}.bin`; - return `![${base}](../../images/_${base})`; - }); + const mediaLinks = mediaMarkdownLinks(t.media); const cleaned = cleanText(t.text, (t.raw as any)?.entities); const prepared = isolateQuotedTweetLinks(cleaned); - parts.push(`${prepared}\n\n${mediaLinks.join("\n")}`.trim()); + const segments = [prepared]; + if (mediaLinks.length) { + segments.push(mediaLinks.join("\n")); + } + parts.push(segments.filter(Boolean).join("\n\n").trim()); } const firstWords = thread.items[0].text.split(/\s+/).slice(0, 5).join(" "); const name = sanitizeFilename(firstWords) || thread.id; const ymd = date.replace(/-/g, ""); const filePath = path.join(threadsDir, `${ymd}/${name}.md`); - const topLink = `https://twitter.com/i/web/status/${first.id}`; - const body = `${fm}\n${parts.join("\n\n")}\n\n[View on Twitter](${topLink})`; + const permalink = buildPermalink(first); + const footer = permalink + ? `\n\n[View on ${permalink.label}](${permalink.url})` + : ""; + const body = `${fm}\n${parts.join("\n\n")}${footer}`; if (dryRun) { logger("info", `(dry-run) would write thread file: ${filePath}`); @@ -152,25 +197,22 @@ export async function writeMarkdown( } } - // Save non-thread tweets by date - // Save single tweets (non-RTs not part of multi-tweet threads) as individual files in tweets/ - for (const t of nonThreadTweets) { + // Save single posts (non-reshares not part of multi-item threads) as individual files in tweets/ + for (const t of nonThreadPosts) { const date = formatIsoDateOnly(t.createdAt); const ymd = date.replace(/-/g, ""); const fm = `---\nDate: ${date}\n---\n`; - const images = (t.media ?? []) - .map((m) => { - const base = m.absPath ? path.basename(m.absPath) : `${m.id}.bin`; - return `![${base}](../../images/_${base})`; - }) - .join("\n"); + const images = mediaMarkdownLinks(t.media).join("\n"); const cleaned = cleanText(t.text, (t.raw as any)?.entities); const prepared = isolateQuotedTweetLinks(cleaned); const withImages = images ? `${prepared}\n\n${images}` : prepared; const words = t.text.split(/\s+/).slice(0, 5).join(" "); const slug = sanitizeFilename(words) || t.id; - const topLink = `https://twitter.com/i/web/status/${t.id}`; - const content = `${fm}\n${withImages}\n\n[View on Twitter](${topLink})`; + const permalink = buildPermalink(t); + const footer = permalink + ? `\n\n[View on ${permalink.label}](${permalink.url})` + : ""; + const content = `${fm}\n${withImages}${footer}`; const filePath = path.join(tweetsDir, `${ymd}/${slug}.md`); if (dryRun) { logger("info", `(dry-run) would write tweet file: ${filePath}`); @@ -191,7 +233,7 @@ export async function writeOAI( outDir: string, systemMessage: string, logger: (l: Level, m: string) => void, - dryRun: boolean, + dryRun: boolean ) { const outPath = path.join(outDir, "conversations_oai.jsonl"); if (dryRun) { @@ -223,7 +265,7 @@ export async function writeNormalizedJSONL( items: ContentItem[], outDir: string, logger: (l: Level, m: string) => void, - dryRun: boolean, + dryRun: boolean ) { const outPath = path.join(outDir, "normalized_items.jsonl"); if (dryRun) { @@ -247,7 +289,7 @@ export async function writeShareGPT( conversations: ContentItem[][], outDir: string, logger: (l: Level, m: string) => void, - dryRun: boolean, + dryRun: boolean ) { const outPath = path.join(outDir, "sharegpt.json"); if (dryRun) { @@ -282,7 +324,7 @@ export async function writeStatsJSON( conversations: ContentItem[][], outDir: string, logger: (l: Level, m: string) => void, - dryRun: boolean, + dryRun: boolean ) { const outPath = path.join(outDir, "stats.json"); const dates = items diff --git a/src/sources/bluesky.ts b/src/sources/bluesky.ts new file mode 100644 index 0000000..3d53c58 --- /dev/null +++ b/src/sources/bluesky.ts @@ -0,0 +1,320 @@ +import * as fs from "node:fs/promises"; +import * as path from "node:path"; +import { readCarWithRoot, MemoryBlockstore, Repo } from "@atproto/repo"; +import { AtUri } from "@atproto/api"; +import type { AppBskyFeedPost } from "@atproto/api"; +import { + ContentItem, + Level, + MediaAttachment, + toIso, +} from "../core/types"; + +const POST_COLLECTION = "app.bsky.feed.post"; + +/** + * Detect whether the provided path looks like a Bluesky/AT Protocol CAR export. + * We keep this lightweight and only check for a readable .car file. + */ +export async function detectBlueskyCar(targetPath: string): Promise { + try { + const stat = await fs.stat(targetPath); + if (!stat.isFile()) return false; + return path.extname(targetPath).toLowerCase() === ".car"; + } catch { + return false; + } +} + +/** + * Ingest a Bluesky repository export (CAR file) and normalize posts into ContentItems. + * Media blobs are referenced but not downloaded; attachments carry blob metadata only. + */ +export async function ingestBlueskyCar( + carPath: string, + logger: (l: Level, m: string) => void, +): Promise { + const absolute = path.resolve(carPath); + logger("info", `Reading Bluesky CAR from ${absolute}`); + const bytes = await fs.readFile(absolute); + const { root, blocks } = await readCarWithRoot(bytes); + const blockstore = new MemoryBlockstore(blocks); + const repo = await Repo.load(blockstore, root); + + logger("info", `Repo DID: ${repo.did}`); + + const items: ContentItem[] = []; + for await (const recordEntry of repo.walkRecords()) { + if (recordEntry.collection !== POST_COLLECTION) continue; + const post = recordEntry.record as AppBskyFeedPost.Record | undefined; + if (!post) continue; + + const uri = formatRecordUri(repo.did, recordEntry.collection, recordEntry.rkey); + const parentUri = extractParentUri(post); + const parentDid = parentUri ? extractDid(parentUri) : null; + + items.push({ + id: uri, + text: post.text ?? "", + createdAt: post.createdAt ? toIso(post.createdAt) : new Date().toISOString(), + parentId: parentUri, + inReplyToUserId: parentDid, + accountId: repo.did, + source: "bluesky:post", + raw: { + uri, + cid: recordEntry.cid.toString(), + collection: recordEntry.collection, + rkey: recordEntry.rkey, + record: cloneRecord(post), + }, + media: extractMediaAttachments(uri, post), + }); + } + + logger("info", `Total normalized Bluesky posts: ${items.length}`); + return items; +} + +function formatRecordUri(did: string, collection: string, rkey: string): string { + const atUri = AtUri.make(`at://${did}`, collection, rkey); + return atUri.toString(); +} + +function extractParentUri( + record: AppBskyFeedPost.Record, +): string | null { + const reply = record.reply; + if (reply?.parent?.uri) { + return reply.parent.uri; + } + return null; +} + +function extractDid(uri: string): string | null { + try { + const parsed = new AtUri(uri); + return parsed.host || null; + } catch { + return null; + } +} + +function cloneRecord(record: AppBskyFeedPost.Record): Record { + return JSON.parse(JSON.stringify(record)); +} + +function extractMediaAttachments( + uri: string, + record: AppBskyFeedPost.Record, +): MediaAttachment[] { + const embed = record.embed as Record | undefined; + if (!embed) return []; + + if (embed.$type === "app.bsky.embed.images") { + return (embed.images ?? []).map((image: any, idx: number) => + makeBlobAttachment( + uri, + `image-${idx}`, + image?.image, + "photo", + { alt: image?.alt, aspectRatio: image?.aspectRatio }, + ), + ); + } + + if (embed.$type === "app.bsky.embed.video") { + return [ + makeBlobAttachment(uri, "video", embed.video, "video", { + alt: embed.alt, + aspectRatio: embed.aspectRatio, + }), + ].filter(Boolean) as MediaAttachment[]; + } + + if (embed.$type === "app.bsky.embed.recordWithMedia" && embed.media) { + return extractMediaAttachments(uri, { ...record, embed: embed.media }); + } + + return []; +} + +function makeBlobAttachment( + uri: string, + suffix: string, + blob: any, + contentType: MediaAttachment["contentType"], + metadata: Record, +): MediaAttachment | null { + if (!blob) return null; + const cid = + typeof blob.cid === "string" + ? blob.cid + : blob.ref?.toString?.() ?? `${uri}#${suffix}`; + return { + id: `${uri}#${suffix}`, + contentType, + metadata: { + ...metadata, + cid, + mimeType: blob.mimeType, + size: blob.size, + }, + }; +} + +const BSKY_PUBLIC_API = "https://public.api.bsky.app"; + +interface ThreadPost { + $type?: string; + post?: { + uri: string; + cid: string; + author: { did: string; handle: string; displayName?: string }; + record: { text?: string; createdAt?: string; reply?: any }; + }; + parent?: ThreadPost; +} + +/** + * Extract a ContentItem from a thread post object. + */ +function threadPostToItem(tp: ThreadPost, parentUri: string | null): ContentItem | null { + if (!tp.post) return null; + const post = tp.post; + const record = post.record; + const author = post.author; + + return { + id: post.uri, + text: record.text ?? "", + createdAt: record.createdAt ? toIso(record.createdAt) : new Date().toISOString(), + parentId: parentUri, + inReplyToUserId: parentUri ? null : null, // We don't track this for fetched posts + accountId: author.did, + source: "bluesky:fetched", + raw: { + uri: post.uri, + cid: post.cid, + author: { + did: author.did, + handle: author.handle, + displayName: author.displayName, + }, + record, + }, + media: [], + }; +} + +/** + * Fetch a post and its full parent chain from the public Bluesky API. + * Returns all posts in the chain (newest first), or empty array if not found. + */ +async function fetchPostChain( + uri: string, + logger: (l: Level, m: string) => void, +): Promise { + const url = `${BSKY_PUBLIC_API}/xrpc/app.bsky.feed.getPostThread?uri=${encodeURIComponent(uri)}&depth=0&parentHeight=50`; + try { + const res = await fetch(url); + if (!res.ok) { + if (res.status === 404 || res.status === 400) { + logger("debug", `Post not found: ${uri}`); + return []; + } + throw new Error(`API error ${res.status}`); + } + const data = await res.json() as { thread?: ThreadPost }; + const thread = data.thread; + if (!thread || thread.$type !== "app.bsky.feed.defs#threadViewPost") { + return []; + } + + // Walk up the parent chain and collect all posts + const posts: ContentItem[] = []; + let current: ThreadPost | undefined = thread; + let childUri: string | null = null; + + while (current && current.$type === "app.bsky.feed.defs#threadViewPost" && current.post) { + // Determine parent URI for this post + const parentUri = current.parent?.post?.uri ?? null; + const item = threadPostToItem(current, parentUri); + if (item) { + posts.push(item); + } + current = current.parent; + } + + return posts; // newest first (the requested post, then its parent, grandparent, etc.) + } catch (e) { + logger("warn", `Failed to fetch ${uri}: ${(e as Error).message}`); + return []; + } +} + +/** + * Enrich a list of Bluesky posts by fetching their parent posts from the public API. + * Returns the original items plus any successfully fetched parent posts. + */ +export async function enrichBlueskyPosts( + items: ContentItem[], + logger: (l: Level, m: string) => void, +): Promise { + // Collect unique parent URIs that we don't already have + const existingIds = new Set(items.map(i => i.id)); + const parentUris = new Set(); + + for (const item of items) { + if (item.parentId && !existingIds.has(item.parentId)) { + parentUris.add(item.parentId); + } + } + + if (parentUris.size === 0) { + logger("info", "No parent posts to fetch"); + return items; + } + + logger("info", `Fetching thread context for ${parentUris.size} parent posts from Bluesky API...`); + + const fetched: ContentItem[] = []; + const fetchedIds = new Set(); + const uriList = Array.from(parentUris); + let completed = 0; + + // Batch with rate limiting - 10 concurrent, 100ms delay between batches + const BATCH_SIZE = 10; + const DELAY_MS = 100; + + for (let i = 0; i < uriList.length; i += BATCH_SIZE) { + const batch = uriList.slice(i, i + BATCH_SIZE); + const results = await Promise.all( + batch.map(uri => fetchPostChain(uri, logger)) + ); + + // Flatten and dedupe (same parent may appear in multiple chains) + for (const chain of results) { + for (const item of chain) { + if (!existingIds.has(item.id) && !fetchedIds.has(item.id)) { + fetched.push(item); + fetchedIds.add(item.id); + } + } + } + + completed += batch.length; + if (completed % 500 === 0 || completed === uriList.length) { + logger("info", `Fetched ${completed}/${uriList.length} thread contexts (${fetched.length} unique posts)`); + } + + // Rate limit delay + if (i + BATCH_SIZE < uriList.length) { + await new Promise(r => setTimeout(r, DELAY_MS)); + } + } + + logger("info", `Enrichment complete: fetched ${fetched.length} unique context posts`); + + return [...items, ...fetched]; +} diff --git a/src/transforms/core.ts b/src/transforms/core.ts index 29a2ff7..cdccda3 100644 --- a/src/transforms/core.ts +++ b/src/transforms/core.ts @@ -6,6 +6,8 @@ import { isRetweet, } from "../core/types"; +const SELF_POST_SOURCES = new Set(["twitter:tweet", "bluesky:post"]); + /** * Replace shortened URLs with expanded; strip t.co links, mentions, hashtags. * Preserve paragraph breaks; collapse intra-line spaces and trim. @@ -22,9 +24,9 @@ export function cleanText( } // Normalize line endings t = t.replace(/\r\n?/g, "\n"); - // Remove t.co links, mentions, and hashtags + // Remove t.co links, mentions (including Bluesky domain-style), and hashtags t = t.replace(/https:\/\/t\.co\/\w+/g, ""); - t = t.replace(/@\w+/g, ""); + t = t.replace(/@[\w.-]+/g, ""); // Matches @user, @user.bsky.social, @berduck.deepfates.com t = t.replace(/#\w+/g, ""); // Collapse spaces/tabs within lines while preserving paragraph breaks t = t @@ -57,6 +59,9 @@ export function applyFilters( const untilTime = opts.until ? new Date(opts.until).getTime() : Infinity; return items.filter((it) => { + // Always preserve fetched parent posts (needed for conversation context) + if (it.source === "bluesky:fetched") return true; + const t = new Date(it.createdAt).getTime(); if (!(t >= sinceTime && t <= untilTime)) return false; if (opts.excludeRt && isRetweet(it.text)) return false; @@ -97,6 +102,8 @@ export function groupThreadsAndConversations( const items = Object.values(all); for (const item of items) { if (processed.has(item.id)) continue; + // Don't start chains from fetched posts - they're context for other posts + if (item.source === "bluesky:fetched") continue; const chain: ContentItem[] = [item]; let current = item; @@ -108,7 +115,9 @@ export function groupThreadsAndConversations( } for (const c of chain) processed.add(c.id); - const allTweets = chain.every((c) => c.source === "twitter:tweet"); + const allSelfPosts = chain.every((c) => + SELF_POST_SOURCES.has(c.source), + ); // Check if this is a self-thread (all tweets are self-replies) // A tweet is a self-reply if: @@ -130,7 +139,7 @@ export function groupThreadsAndConversations( return true; }); - if (allTweets && isSelfThread) { + if (allSelfPosts && isSelfThread) { const ordered = chain.slice().reverse(); // oldest → newest threads.push({ id: ordered[0].id, items: ordered }); } else { @@ -149,8 +158,13 @@ export function groupThreadsAndConversations( * - Trim trailing user messages to end on assistant if possible. */ export function inferRole(it: ContentItem): Role { - // Heuristic: tweets that look like assistant outputs (e.g., have full_text) are "assistant"; others are "user" - return it.raw && "full_text" in (it.raw as any) ? "assistant" : "user"; + // Twitter: has full_text in raw (from archive owner's tweets) + if (it.raw && "full_text" in (it.raw as any)) return "assistant"; + + // Bluesky: the archive only contains our posts, so all bluesky:post items are "assistant" + if (it.source === "bluesky:post") return "assistant"; + + return "user"; } export function messagesFromConversation(items: ContentItem[]): ChatMessage[] { diff --git a/tests/integration/basic.test.ts b/tests/integration/basic.test.ts index 4b40614..14f5aec 100644 --- a/tests/integration/basic.test.ts +++ b/tests/integration/basic.test.ts @@ -243,13 +243,20 @@ describe("splice CLI integration", () => { // Check that we have exactly one thread (root + self-reply only) const threadsDir = path.join(tempOut, "threads"); - const threadFiles = await fs.readdir(threadsDir); + const threadSubdirs = ( + await fs.readdir(threadsDir, { withFileTypes: true }) + ) + .filter((d) => d.isDirectory()) + .map((d) => d.name); + expect(threadSubdirs.length).toBe(1); + const threadYmdDir = path.join(threadsDir, threadSubdirs[0]); + const threadFiles = await fs.readdir(threadYmdDir); const mdFiles = threadFiles.filter((f) => f.endsWith(".md")); expect(mdFiles.length).toBe(1); // Read the thread content const threadContent = await fs.readFile( - path.join(threadsDir, mdFiles[0]), + path.join(threadYmdDir, mdFiles[0]), "utf8", ); diff --git a/tests/integration/media.test.ts b/tests/integration/media.test.ts index 8fc5f13..120ba47 100644 --- a/tests/integration/media.test.ts +++ b/tests/integration/media.test.ts @@ -89,17 +89,24 @@ describe("splice CLI media handling", () => { expect(copiedStat.size).toBeGreaterThan(0); // Find the generated thread markdown and assert the image link is present - const threadFiles = await fs.readdir(threadsDir); + const threadSubdirs = ( + await fs.readdir(threadsDir, { withFileTypes: true }) + ) + .filter((d) => d.isDirectory()) + .map((d) => d.name); + expect(threadSubdirs.length).toBeGreaterThan(0); + const threadYmdDir = path.join(threadsDir, threadSubdirs[0]); + const threadFiles = await fs.readdir(threadYmdDir); const mdFiles = threadFiles.filter((f) => f.endsWith(".md")); expect(mdFiles.length).toBeGreaterThan(0); // Read first thread file - const threadMdPath = path.join(threadsDir, mdFiles[0]); + const threadMdPath = path.join(threadYmdDir, mdFiles[0]); const threadContent = await fs.readFile(threadMdPath, "utf8"); - // Threads are saved under out/threads, so images are linked as ../images/_ + // Threads are saved under out/threads/, so images are linked as ../../images/_ const expectedMdImage = - `![${sourceMediaBasename}](../images/${copiedMediaBasename})`; + `![${sourceMediaBasename}](../../images/${copiedMediaBasename})`; expect(threadContent).toContain(expectedMdImage); },