From 69f10d03f970e58d5c683f83a826751c7365a3ef Mon Sep 17 00:00:00 2001 From: deepfates Date: Sat, 18 Oct 2025 15:54:25 -0700 Subject: [PATCH 1/4] refactor: modular pipeline layout (sources/transforms/outputs/core/cli); preserve CLI behavior; add tsconfig; update build/bin --- package-lock.json | 189 ++++++++++++++++++++++++- src/cli/splice.ts | 268 +++++++++++++++++++++++++++++++++++ src/core/types.ts | 312 +++++++++++++++++++++++++++++++++++++++++ src/outputs/writers.ts | 279 ++++++++++++++++++++++++++++++++++++ src/sources/twitter.ts | 140 ++++++++++++++++++ src/transforms/core.ts | 136 ++++++++++++++++++ tsconfig.json | 30 ++++ 7 files changed, 1347 insertions(+), 7 deletions(-) create mode 100644 src/cli/splice.ts create mode 100644 src/core/types.ts create mode 100644 src/outputs/writers.ts create mode 100644 src/sources/twitter.ts create mode 100644 src/transforms/core.ts create mode 100644 tsconfig.json diff --git a/package-lock.json b/package-lock.json index 2139d6a..e9b1e8b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,15 +1,18 @@ { - "name": "splice", - "version": "0.1.0", + "name": "@deepfates/splice", + "version": "0.1.1", "lockfileVersion": 3, "requires": true, "packages": { "": { - "name": "splice", - "version": "0.1.0", + "name": "@deepfates/splice", + "version": "0.1.1", "license": "MIT", + "dependencies": { + "cosmiconfig": "^9.0.0" + }, "bin": { - "splice": "dist/splice.js" + "splice": "dist/cli/splice.js" }, "devDependencies": { "@types/node": "^22.7.4", @@ -22,6 +25,29 @@ "node": ">=18" } }, + "node_modules/@babel/code-frame": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.27.1.tgz", + "integrity": "sha512-cjQ7ZlQ0Mv3b47hABuTevyTuYN4i+loJKGeV9flcCgIK37cCXRh+L1bd3iBHlynerhQ7BhCkn2BPbQUL+rGqFg==", + "license": "MIT", + "dependencies": { + "@babel/helper-validator-identifier": "^7.27.1", + "js-tokens": "^4.0.0", + "picocolors": "^1.1.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.27.1.tgz", + "integrity": "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, "node_modules/@esbuild/aix-ppc64": { "version": "0.25.10", "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.25.10.tgz", @@ -929,6 +955,12 @@ "url": "https://opencollective.com/vitest" } }, + "node_modules/argparse": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", + "license": "Python-2.0" + }, "node_modules/assertion-error": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-2.0.1.tgz", @@ -949,6 +981,15 @@ "node": ">=8" } }, + "node_modules/callsites": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", + "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/chai": { "version": "5.3.3", "resolved": "https://registry.npmjs.org/chai/-/chai-5.3.3.tgz", @@ -976,6 +1017,32 @@ "node": ">= 16" } }, + "node_modules/cosmiconfig": { + "version": "9.0.0", + 
"resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-9.0.0.tgz", + "integrity": "sha512-itvL5h8RETACmOTFc4UfIyB2RfEHi71Ax6E/PivVxq9NseKbOWpeyHEOIbmAw1rs8Ak0VursQNww7lf7YtUwzg==", + "license": "MIT", + "dependencies": { + "env-paths": "^2.2.1", + "import-fresh": "^3.3.0", + "js-yaml": "^4.1.0", + "parse-json": "^5.2.0" + }, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/d-fischer" + }, + "peerDependencies": { + "typescript": ">=4.9.5" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, "node_modules/cross-spawn": { "version": "7.0.6", "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", @@ -1019,6 +1086,24 @@ "node": ">=6" } }, + "node_modules/env-paths": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/env-paths/-/env-paths-2.2.1.tgz", + "integrity": "sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/error-ex": { + "version": "1.3.4", + "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.4.tgz", + "integrity": "sha512-sqQamAnR14VgCr1A618A3sGrygcpK+HEbenA/HiEAkkUwcZIIB/tgWqHFxWgOyDh4nB4JCRimh79dR5Ywc9MDQ==", + "license": "MIT", + "dependencies": { + "is-arrayish": "^0.2.1" + } + }, "node_modules/es-module-lexer": { "version": "1.7.0", "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.7.0.tgz", @@ -1186,6 +1271,28 @@ "node": ">=18.18.0" } }, + "node_modules/import-fresh": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", + "integrity": "sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ==", + "license": "MIT", + "dependencies": { + "parent-module": "^1.0.0", + "resolve-from": "^4.0.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-arrayish": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.2.1.tgz", + "integrity": "sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==", + "license": "MIT" + }, "node_modules/is-plain-obj": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-4.1.0.tgz", @@ -1232,6 +1339,36 @@ "dev": true, "license": "ISC" }, + "node_modules/js-tokens": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", + "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", + "license": "MIT" + }, + "node_modules/js-yaml": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz", + "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==", + "license": "MIT", + "dependencies": { + "argparse": "^2.0.1" + }, + "bin": { + "js-yaml": "bin/js-yaml.js" + } + }, + "node_modules/json-parse-even-better-errors": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz", + "integrity": "sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==", + "license": "MIT" + }, + "node_modules/lines-and-columns": { + "version": "1.2.4", + "resolved": 
"https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", + "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", + "license": "MIT" + }, "node_modules/loupe": { "version": "3.2.1", "resolved": "https://registry.npmjs.org/loupe/-/loupe-3.2.1.tgz", @@ -1305,6 +1442,36 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/parent-module": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", + "integrity": "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==", + "license": "MIT", + "dependencies": { + "callsites": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/parse-json": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-5.2.0.tgz", + "integrity": "sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==", + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.0.0", + "error-ex": "^1.3.1", + "json-parse-even-better-errors": "^2.3.0", + "lines-and-columns": "^1.1.6" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/parse-ms": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/parse-ms/-/parse-ms-4.0.0.tgz", @@ -1349,7 +1516,6 @@ "version": "1.1.1", "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", - "dev": true, "license": "ISC" }, "node_modules/postcss": { @@ -1397,6 +1563,15 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/resolve-from": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", + "integrity": "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==", + "license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/resolve-pkg-maps": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz", @@ -1597,7 +1772,7 @@ "version": "5.9.3", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", - "dev": true, + "devOptional": true, "license": "Apache-2.0", "bin": { "tsc": "bin/tsc", diff --git a/src/cli/splice.ts b/src/cli/splice.ts new file mode 100644 index 0000000..f77e023 --- /dev/null +++ b/src/cli/splice.ts @@ -0,0 +1,268 @@ +#!/usr/bin/env -S tsx +/** + * splice — CLI entrypoint + * Wires sources → transforms → outputs using modular architecture. + * + * Maintains existing flags/behavior from the original monolithic script. 
+ */ + +import * as path from "node:path"; +import * as fs from "node:fs/promises"; +import { fileURLToPath } from "node:url"; + +import { + CLIOptions, + parseArgs, + makeLogger, + usage, +} from "../core/types"; + +import { detectTwitterArchive, ingestTwitter } from "../sources/twitter"; +import { + applyFilters, + indexById, + groupThreadsAndConversations, +} from "../transforms/core"; +import { + writeMarkdown, + writeOAI, + writeNormalizedJSONL, + writeShareGPT, + writeStatsJSON, +} from "../outputs/writers"; + +/* -------------------------------- version -------------------------------- */ + +async function getVersion(): Promise { + try { + const thisFile = fileURLToPath(import.meta.url); + const dir = path.dirname(thisFile); + // src/cli/splice.ts -> ../../package.json + // dist/cli/splice.js -> ../../package.json + const pkgPath = path.join(dir, "..", "..", "package.json"); + const raw = await fs.readFile(pkgPath, "utf8"); + const pkg = JSON.parse(raw); + return typeof pkg.version === "string" ? pkg.version : "0.0.0"; + } catch { + return "0.0.0"; + } +} + +/* ---------------------------------- main ---------------------------------- */ + +async function main() { + const opts: CLIOptions = parseArgs(process.argv); + if (opts.help) { + process.stderr.write(usage() + "\n"); + process.exit(0); + } + if (opts.version) { + const v = await getVersion(); + process.stdout.write(`splice ${v}\n`); + process.exit(0); + } + + // Allow quick verbosity shorthands unless an explicit --log-level was provided + { + const argv = process.argv.slice(2); + const hasExplicitLogLevel = argv.includes("--log-level"); + const wantsQuiet = argv.includes("--quiet") || argv.includes("-q"); + const wantsVerbose = argv.includes("--verbose"); + if (!hasExplicitLogLevel) { + if (wantsQuiet) (opts as any).logLevel = "error"; + else if (wantsVerbose) (opts as any).logLevel = "debug"; + } + } + + const logger = makeLogger(opts.logLevel); + + // Warn on unknown flags with a simple suggestion + { + const argv = process.argv.slice(2); + const known = new Set([ + "--help", + "-h", + "--version", + "-V", + "--source", + "--archive-path", + "--out", + "--output-dir", + "--format", + "--formats", + "--output-formats", + "--system-message", + "--system", + "--dry-run", + "-n", + "--log-level", + "--quiet", + "-q", + "--verbose", + "--json-stdout", + "--since", + "--until", + "--min-length", + "--exclude-rt", + "--only-threads", + "--with-media", + "--stats-json", + "--", + ]); + const unknown = argv.filter( + (a) => a.startsWith("-") && !known.has(a) && a !== "-" && a !== "--", + ); + const candidates = Array.from(known).filter((f) => f.startsWith("--")); + const suggest = (flag: string): string | null => { + let best: string | null = null; + let score = -1; + for (const c of candidates) { + // simple common prefix score + let s = 0; + const L = Math.min(flag.length, c.length); + for (let i = 0; i < L; i++) { + if (flag[i] === c[i]) s++; + else break; + } + if (s > score) { + score = s; + best = c; + } + } + return score >= 2 ? best : null; + }; + for (const uf of unknown) { + const hint = suggest(uf); + if (hint) logger("warn", `Unknown flag ${uf}. Did you mean ${hint}?`); + else + logger( + "warn", + `Unknown flag ${uf}. 
Run with --help to see supported flags.`, + ); + } + } + + if (!opts.source || !opts.out) { + process.stderr.write(usage() + "\n"); + process.exit(2); + } + + const source = path.resolve(opts.source); + const outDir = path.resolve(opts.out); + + const detected = await detectTwitterArchive(source); + if (!detected) { + logger( + "error", + `Could not detect a Twitter archive at ${source} (missing data/manifest.js)`, + ); + process.exit(2); + } + + try { + logger("info", `Ingesting from ${source}`); + const items = await ingestTwitter(source, logger); + + const filtered = applyFilters(items, { + since: opts.since, + until: opts.until, + minLength: opts.minLength, + excludeRt: opts.excludeRt, + onlyThreads: opts.onlyThreads, + withMedia: opts.withMedia, + }); + + const all = indexById(filtered); + let { threads, conversations } = groupThreadsAndConversations(all); + if (opts.onlyThreads) { + conversations = []; + } + logger( + "info", + `Threads: ${threads.length}, Conversations: ${conversations.length}`, + ); + + // Validate formats and support --json-stdout for piping normalized items + const argv = process.argv.slice(2); + const formatSpecified = + argv.includes("--format") || + argv.includes("--formats") || + argv.includes("--output-formats"); + const allowedFormats = new Set(["markdown", "oai", "json", "sharegpt"]); + const requested = opts.format || []; + const validFormats = requested.filter((f) => allowedFormats.has(f)); + const invalidFormats = requested.filter((f) => !allowedFormats.has(f)); + for (const bad of invalidFormats) { + logger("warn", `Unknown format "${bad}". Supported: markdown, oai, json`); + } + const jsonStdout = argv.includes("--json-stdout"); + + if (jsonStdout) { + // Print normalized items as JSONL to stdout; logs remain on stderr + for (const it of items) { + process.stdout.write(JSON.stringify(it) + "\n"); + } + logger("info", "Wrote normalized items to stdout"); + process.exit(0); + } + + if (formatSpecified && validFormats.length === 0) { + logger( + "error", + "No valid formats requested. Supported: markdown, oai, json", + ); + process.stderr.write(usage() + "\n"); + process.exit(2); + } + + if (validFormats.includes("markdown")) { + await writeMarkdown( + threads, + opts.onlyThreads ? [] : filtered, + outDir, + logger, + opts.dryRun, + ); + } + if (validFormats.includes("json")) { + await writeNormalizedJSONL(items, outDir, logger, opts.dryRun); + } + const systemMessage = + process.env.SPLICE_SYSTEM_MESSAGE ?? opts.systemMessage; + logger("debug", `System message: ${systemMessage}`); + if (validFormats.includes("oai")) { + await writeOAI( + threads, + conversations, + outDir, + systemMessage, + logger, + opts.dryRun, + ); + } + if (validFormats.includes("sharegpt")) { + await writeShareGPT(threads, conversations, outDir, logger, opts.dryRun); + } + if (opts.statsJson) { + await writeStatsJSON( + filtered, + threads, + conversations, + outDir, + logger, + opts.dryRun, + ); + } + + logger("info", opts.dryRun ? "Dry run complete." : "Done."); + process.exit(0); + } catch (e) { + logger("error", (e as Error).message); + process.exit(1); + } +} + +main().catch((err) => { + process.stderr.write(`[error] ${(err as Error).message}\n`); + process.exit(1); +}); diff --git a/src/core/types.ts b/src/core/types.ts new file mode 100644 index 0000000..647bf73 --- /dev/null +++ b/src/core/types.ts @@ -0,0 +1,312 @@ +/** + * Core types, CLI args, logger, and shared utilities. + * Extracted to support a modular pipeline architecture. 
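+ *
+ * An illustrative ContentItem as produced by a source adapter (values are
+ * invented; see the interface below for the full shape):
+ *
+ *   {
+ *     id: "1234567890",
+ *     text: "hello world",
+ *     createdAt: "2024-01-01T00:00:00.000Z",
+ *     parentId: null,
+ *     source: "twitter:tweet",
+ *   }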
+ */ + +import * as fs from "node:fs/promises"; +import * as path from "node:path"; + +/* --------------------------------- Types --------------------------------- */ + +export type Level = "debug" | "info" | "warn" | "error"; + +export type SourceId = "twitter:tweet" | "twitter:like" | string; + +export interface MediaAttachment { + id: string; + contentType: "photo" | "video" | "unknown"; + absPath?: string; // local absolute path if available + url?: string; // remote URL if available + metadata?: Record; +} + +export interface ContentItem { + id: string; + text: string; + createdAt: string; // ISO-8601 + parentId?: string | null; + source: SourceId; + raw?: Record; + media?: MediaAttachment[]; + annotations?: Record; +} + +export interface Thread { + id: string; + items: ContentItem[]; // ordered oldest → newest +} + +export type Role = "assistant" | "user"; + +export interface ChatMessage { + role: Role; + content: string; +} + +/* -------------------------------- Logger --------------------------------- */ + +export function makeLogger(level: Level): (lvl: Level, msg: string) => void { + const order: Level[] = ["debug", "info", "warn", "error"]; + const minIdx = order.indexOf(level); + return (lvl: Level, msg: string) => { + if (order.indexOf(lvl) >= minIdx) { + process.stderr.write(`[${lvl}] ${msg}\n`); + } + }; +} + +/* --------------------------------- Args ---------------------------------- */ + +export type CLIOptions = { + source?: string; + out?: string; + format: string[]; // e.g. ['markdown','oai'] + systemMessage: string; + dryRun: boolean; + logLevel: Level; + help: boolean; + version: boolean; + // filters + since?: string; + until?: string; + minLength: number; + excludeRt: boolean; + onlyThreads: boolean; + withMedia: boolean; + // outputs + statsJson: boolean; +}; + +export const DEFAULT_SYSTEM_MESSAGE = + "You have been uploaded to the internet"; + +export function parseArgs(argv: string[]): CLIOptions { + const opts: CLIOptions = { + format: ["markdown", "oai"], + systemMessage: DEFAULT_SYSTEM_MESSAGE, + dryRun: false, + logLevel: "info", + help: false, + version: false, + since: undefined, + until: undefined, + minLength: 0, + excludeRt: false, + onlyThreads: false, + withMedia: false, + statsJson: false, + }; + + const args = argv.slice(2); + let systemExplicit = false; + for (let i = 0; i < args.length; i++) { + const a = args[i]; + if (a === "--help" || a === "-h") { + opts.help = true; + } else if (a === "--version" || a === "-V") { + opts.version = true; + } else if (a === "--source" || a === "--archive-path") { + opts.source = args[++i]; + } else if (a === "--out" || a === "--output-dir") { + opts.out = args[++i]; + } else if ( + a === "--format" || + a === "--formats" || + a === "--output-formats" + ) { + const next = args[++i]; + if (!next) continue; + // allow space or comma separated + const parts = next.split(",").filter(Boolean); + if (parts.length > 1) opts.format = parts; + else { + // collect following non-flag tokens too (space-separated list) + const list = [next]; + while (args[i + 1] && !args[i + 1].startsWith("-")) { + list.push(args[++i]); + } + opts.format = list; + } + } else if (a === "--system-message" || a === "--system") { + const val = args[++i]; + if (val) { + opts.systemMessage = val; + systemExplicit = true; + } + } else if (a === "--dry-run" || a === "-n") { + opts.dryRun = true; + } else if (a === "--log-level") { + const lvl = (args[++i] ?? 
"").toLowerCase(); + if ( + lvl === "debug" || + lvl === "info" || + lvl === "warn" || + lvl === "error" + ) { + opts.logLevel = lvl; + } + } else if (a === "--since") { + opts.since = args[++i]; + } else if (a === "--until") { + opts.until = args[++i]; + } else if (a === "--min-length") { + const v = parseInt(args[++i] ?? "", 10); + if (!Number.isNaN(v)) opts.minLength = v; + } else if (a === "--exclude-rt") { + opts.excludeRt = true; + } else if (a === "--only-threads") { + opts.onlyThreads = true; + } else if (a === "--with-media") { + opts.withMedia = true; + } else if (a === "--stats-json") { + opts.statsJson = true; + } else if (a === "--") { + break; + } else if (a.startsWith("-")) { + // unknown flag; ignore to keep simple (CLI warns elsewhere) + } else { + // positional? ignore for now + } + } + if (!systemExplicit && process.env.SPLICE_SYSTEM_MESSAGE) { + opts.systemMessage = process.env.SPLICE_SYSTEM_MESSAGE as string; + } + return opts; +} + +export function usage(): string { + return [ + "splice — convert a Twitter archive to Markdown, OAI JSONL, and/or JSON", + "", + "Usage:", + " splice --source --out [--format markdown oai json sharegpt] [--system-message ]", + " [--since ] [--until ] [--min-length ] [--exclude-rt] [--only-threads] [--with-media]", + " [--dry-run] [--stats-json] [--log-level ] [--json-stdout] [--quiet|-q] [--verbose] [--version|-V]", + "", + "Options:", + " --source Path to the Twitter archive directory", + " --out Output directory", + " --format One or more formats: markdown, oai, json, sharegpt (default: markdown oai)", + ' --system, --system-message System message for OAI JSONL (default: "You have been uploaded to the internet")', + " --since Include items on/after this ISO date", + " --until Include items on/before this ISO date", + " --min-length Minimum text length", + " --exclude-rt Exclude retweets (RT ...)", + " --only-threads Output threads only (ignore conversations/non-thread tweets)", + " --with-media Only include items that have media", + " --dry-run, -n Plan only; don’t write files", + " --stats-json Write a stats.json summary", + " --log-level debug|info|warn|error (default: info)", + " --json-stdout Emit normalized items JSONL to stdout (no files); logs to stderr", + " --quiet, -q Errors only", + " --verbose Debug logging", + " --version, -V Show version", + " --help, -h Show help", + "", + "Examples:", + " splice --source ./archive --out ./out --format markdown oai json", + ' splice --source ./archive --out ./out --format oai --system-message "You are helpful."', + " splice --source ./archive --out ./out --since 2024-01-01 --only-threads", + " splice --source ./archive --out ./out --json-stdout", + " splice --version", + "", + "Docs: https://github.com/deepfates/splice • Context: https://deepfates.com/convert-your-twitter-archive-into-training-data", + ].join("\n"); +} + +/* --------------------------------- Utils --------------------------------- */ + +export function cleanJsonString(js: string): string { + // remove window.* = prefix and trailing semicolon + return js + .trim() + .replace(/^window\.[^=]+=\s*/i, "") + .replace(/;?\s*$/, ""); +} + +export async function readJsonFromJs(filePath: string): Promise { + const raw = await fs.readFile(filePath, "utf8"); + const cleaned = cleanJsonString(raw); + try { + return JSON.parse(cleaned); + } catch { + // try __THAR_CONFIG fallback + const match = raw.match(/window\.__THAR_CONFIG\s*=\s*({[\s\S]*?})\s*;?/); + if (match) return JSON.parse(match[1]); + throw new Error(`Could not parse JSON 
from ${filePath}`); + } +} + +/** + * Accepts strict JSON arrays or loose JS array/object literals. + * Returns [] on failure. + */ +export function parseLooseArray(input: string): any[] { + // Try strict JSON first + try { + const parsed = JSON.parse(input); + return Array.isArray(parsed) ? parsed : []; + } catch { + // Fall through to loose JS evaluation + } + + // Attempt to evaluate as a JS array/object literal in a confined context. + // cleanJsonString should have removed any "window.* = " prefix so input should be an array expression. + try { + // eslint-disable-next-line no-new-func + const fn = new Function('"use strict"; return (' + input + ");"); + const result = fn(); + return Array.isArray(result) ? result : []; + } catch { + return []; + } +} + +export async function loadConfig(): Promise { + try { + const mod: any = await import("cosmiconfig"); + const explorer = mod.cosmiconfig("splice"); + const result = await explorer.search(); + return result?.config; + } catch { + return undefined; + } +} + +export function mediaTypeFromExt( + filename: string, +): "photo" | "video" | "unknown" { + const ext = path.extname(filename).toLowerCase(); + if (ext === ".mp4" || ext === ".mov") return "video"; + if (ext === ".jpg" || ext === ".jpeg" || ext === ".png" || ext === ".gif") + return "photo"; + return "unknown"; +} + +export function sanitizeFilename(name: string, maxLen = 50): string { + return ( + name + .replace(/[^\w\-_ ]/g, "") + .trim() + .replace(/\s+/g, "_") + .slice(0, maxLen) || "untitled" + ); +} + +export function toIso(d: string | Date): string { + const dt = typeof d === "string" ? new Date(d) : d; + return Number.isNaN(dt.getTime()) + ? new Date().toISOString() + : dt.toISOString(); +} + +export function isRetweet(text: string): boolean { + return /^RT\b/.test(text || ""); +} + +export function formatIsoDateOnly(iso: string): string { + const d = new Date(iso); + return isNaN(d.getTime()) + ? new Date().toISOString().slice(0, 10) + : d.toISOString().slice(0, 10); +} diff --git a/src/outputs/writers.ts b/src/outputs/writers.ts new file mode 100644 index 0000000..33358ad --- /dev/null +++ b/src/outputs/writers.ts @@ -0,0 +1,279 @@ +import * as fs from "node:fs/promises"; +import * as path from "node:path"; +import { + ContentItem, + Thread, + Level, + formatIsoDateOnly, + sanitizeFilename, + isRetweet, +} from "../core/types"; +import { cleanText, messagesFromConversation } from "../transforms/core"; + +/** + * Ensure a directory exists (mkdir -p). + */ +async function ensureDir(p: string) { + await fs.mkdir(p, { recursive: true }); +} + +/** + * Copy media attachments for a set of items into imagesDir, prefixing names with "_". + * If an attachment lacks absPath, it will be skipped with a warning. + */ +async function copyMedia( + items: ContentItem[], + imagesDir: string, + logger: (l: Level, m: string) => void, +) { + await ensureDir(imagesDir); + for (const it of items) { + for (const m of it.media ?? []) { + const base = m.absPath ? path.basename(m.absPath) : `${m.id}.bin`; + try { + if (!m.absPath) { + logger("warn", `No absPath for media ${m.id}; skipping copy`); + continue; + } + await fs.copyFile(m.absPath, path.join(imagesDir, `_${base}`)); + } catch (e) { + logger( + "warn", + `Failed to copy media ${m.absPath ?? 
m.id}: ${(e as Error).message}`, + ); + } + } + } +} + +/** + * Write Markdown outputs: + * - threads/.md with frontmatter, cleaned text, media links, and link to Twitter + * - tweets_by_date/<YYYY-MM-DD>.md for non-thread tweets (excluding RTs) + * - images/_<file> copied for referenced items + */ +export async function writeMarkdown( + threads: Thread[], + items: ContentItem[], + outDir: string, + logger: (l: Level, m: string) => void, + dryRun: boolean, +) { + const threadsDir = path.join(outDir, "threads"); + const byDateDir = path.join(outDir, "tweets_by_date"); + const imagesDir = path.join(outDir, "images"); + + if (!dryRun) { + await ensureDir(threadsDir); + await ensureDir(byDateDir); + await ensureDir(imagesDir); + } + + // Copy media for all thread items + non-thread tweets + const threadItems = threads.flatMap((t) => t.items); + const threadIds = new Set(threadItems.map((i) => i.id)); + const nonThreadTweets = items.filter( + (i) => + i.source === "twitter:tweet" && + !i.parentId && + !threadIds.has(i.id) && + !isRetweet(i.text), + ); + const copyPool = threadItems.concat(nonThreadTweets); + + logger("info", `Preparing media for ${copyPool.length} items`); + if (!dryRun) await copyMedia(copyPool, imagesDir, logger); + + // Save threads + logger("info", `Saving ${threads.length} threads`); + for (const thread of threads) { + const first = thread.items[0]; + const date = formatIsoDateOnly(first.createdAt); + const fm = `---\nDate: ${date}\n---\n`; + + const parts: string[] = []; + for (const t of thread.items) { + const mediaLinks = (t.media ?? []).map((m) => { + const base = m.absPath ? path.basename(m.absPath) : `${m.id}.bin`; + return `![${base}](../images/_${base})`; + }); + const cleaned = cleanText(t.text, (t.raw as any)?.entities); + parts.push(`${cleaned}\n\n${mediaLinks.join("\n")}`.trim()); + } + + const firstWords = thread.items[0].text.split(/\s+/).slice(0, 5).join(" "); + const name = sanitizeFilename(firstWords) || thread.id; + const filePath = path.join(threadsDir, `${name}.md`); + const topLink = `https://twitter.com/i/web/status/${first.id}`; + const body = `${fm}\n${parts.join("\n\n")}\n\n[View on Twitter](${topLink})`; + + if (dryRun) { + logger("info", `(dry-run) would write thread file: ${filePath}`); + } else { + await fs.writeFile(filePath, body, "utf8"); + } + } + + // Save non-thread tweets by date + const byDate: Record<string, ContentItem[]> = {}; + for (const t of nonThreadTweets) { + const d = formatIsoDateOnly(t.createdAt); + (byDate[d] ||= []).push(t); + } + + for (const [date, dayItems] of Object.entries(byDate)) { + dayItems.sort((a, b) => a.createdAt.localeCompare(b.createdAt)); + const content = dayItems + .map((t) => { + const dt = new Date(t.createdAt); + const time = isNaN(dt.getTime()) + ? "" + : dt.toLocaleTimeString("en-US", { + hour: "numeric", + minute: "2-digit", + }); + const images = (t.media ?? []) + .map((m) => { + const base = m.absPath ? path.basename(m.absPath) : `${m.id}.bin`; + return `![${base}](../images/_${base})`; + }) + .join(""); + const cleaned = cleanText(t.text, (t.raw as any)?.entities); + return `*${time}* \n${cleaned}${images}`; + }) + .join("\n\n---\n\n"); + + const filePath = path.join(byDateDir, `${date}.md`); + if (dryRun) { + logger("info", `(dry-run) would write daily file: ${filePath}`); + } else { + await fs.writeFile(filePath, content, "utf8"); + } + } +} + +/** + * Write conversations in OpenAI JSONL format. + * Note: Includes a system message at the top of each conversation. 
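+ *
+ * Each line is one JSON record, shaped roughly like:
+ *   {"messages":[{"role":"system","content":"..."},{"role":"user","content":"..."},{"role":"assistant","content":"..."}]}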
+ */ +export async function writeOAI( + threads: Thread[], + conversations: ContentItem[][], + outDir: string, + systemMessage: string, + logger: (l: Level, m: string) => void, + dryRun: boolean, +) { + const outPath = path.join(outDir, "conversations_oai.jsonl"); + if (dryRun) { + logger("info", `(dry-run) would write OAI JSONL: ${outPath}`); + return; + } + await ensureDir(path.dirname(outPath)); + const fh = await fs.open(outPath, "w"); + + const writeConv = async (items: ContentItem[]) => { + const msgs = messagesFromConversation(items); + if (!msgs.length) return; + const record = { + messages: [{ role: "system", content: systemMessage }, ...msgs], + }; + await fh.write(JSON.stringify(record) + "\n"); + }; + + for (const t of threads) await writeConv(t.items); + for (const c of conversations) await writeConv(c); + await fh.close(); + logger("info", `Wrote OAI JSONL to ${outPath}`); +} + +/** + * Write the normalized ContentItem stream as JSONL for downstream reuse. + */ +export async function writeNormalizedJSONL( + items: ContentItem[], + outDir: string, + logger: (l: Level, m: string) => void, + dryRun: boolean, +) { + const outPath = path.join(outDir, "normalized_items.jsonl"); + if (dryRun) { + logger("info", `(dry-run) would write normalized items JSONL: ${outPath}`); + return; + } + await ensureDir(path.dirname(outPath)); + const fh = await fs.open(outPath, "w"); + for (const it of items) { + await fh.write(JSON.stringify(it) + "\n"); + } + await fh.close(); + logger("info", `Wrote normalized items JSONL to ${outPath}`); +} + +/** + * Write ShareGPT JSON format from conversations derived from threads and mixed conversations. + */ +export async function writeShareGPT( + threads: Thread[], + conversations: ContentItem[][], + outDir: string, + logger: (l: Level, m: string) => void, + dryRun: boolean, +) { + const outPath = path.join(outDir, "sharegpt.json"); + if (dryRun) { + logger("info", `(dry-run) would write ShareGPT JSON: ${outPath}`); + return; + } + await ensureDir(path.dirname(outPath)); + const list: Array<{ conversations: Array<{ from: string; value: string }> }> = + []; + const addConv = async (items: ContentItem[]) => { + const msgs = messagesFromConversation(items); + if (!msgs.length) return; + list.push({ + conversations: msgs.map((m) => ({ + from: m.role === "user" ? "human" : "gpt", + value: m.content, + })), + }); + }; + for (const t of threads) await addConv(t.items); + for (const c of conversations) await addConv(c); + await fs.writeFile(outPath, JSON.stringify(list, null, 2), "utf8"); + logger("info", `Wrote ShareGPT JSON to ${outPath}`); +} + +/** + * Write a small stats.json summary about items, threads, conversations, and date range. + */ +export async function writeStatsJSON( + items: ContentItem[], + threads: Thread[], + conversations: ContentItem[][], + outDir: string, + logger: (l: Level, m: string) => void, + dryRun: boolean, +) { + const outPath = path.join(outDir, "stats.json"); + const dates = items + .map((i) => new Date(i.createdAt).toISOString()) + .filter(Boolean); + const start = dates.length ? dates.reduce((a, b) => (a < b ? a : b)) : null; + const end = dates.length ? dates.reduce((a, b) => (a > b ? 
a : b)) : null; + const stats = { + totalItems: items.length, + tweets: items.filter((i) => i.source === "twitter:tweet").length, + likes: items.filter((i) => i.source === "twitter:like").length, + threads: threads.length, + conversations: conversations.length, + dateRange: { start, end }, + }; + if (dryRun) { + logger("info", `(dry-run) would write stats JSON: ${outPath}`); + return; + } + await ensureDir(path.dirname(outPath)); + await fs.writeFile(outPath, JSON.stringify(stats, null, 2), "utf8"); + logger("info", `Wrote stats JSON to ${outPath}`); +} diff --git a/src/sources/twitter.ts b/src/sources/twitter.ts new file mode 100644 index 0000000..9ec5e12 --- /dev/null +++ b/src/sources/twitter.ts @@ -0,0 +1,140 @@ +import * as fs from "node:fs/promises"; +import * as path from "node:path"; +import { + Level, + ContentItem, + MediaAttachment, + readJsonFromJs, + parseLooseArray, + mediaTypeFromExt, + toIso, + cleanJsonString, +} from "../core/types"; + +/** + * Subset of the Twitter/X archive manifest schema + */ +type Manifest = { + dataTypes?: Record<string, { files?: Array<{ fileName: string }> }>; +}; + +/** + * Detect whether a directory looks like a Twitter/X archive by checking for data/manifest.js + */ +export async function detectTwitterArchive(rootPath: string): Promise<boolean> { + try { + const p = path.join(rootPath, "data", "manifest.js"); + await fs.stat(p); + return true; + } catch { + return false; + } +} + +/** + * Return media file basenames for a given tweet id. + * Filters out zero-byte files to avoid broken copies. + */ +async function getMediaFiles(root: string, id: string): Promise<string[]> { + const mediaDir = path.join(root, "data", "tweets_media"); + try { + const files = await fs.readdir(mediaDir); + const filtered: string[] = []; + for (const f of files) { + if (!f.startsWith(`${id}-`)) continue; + const stat = await fs.stat(path.join(mediaDir, f)); + if (stat.size > 0) filtered.push(f); + } + return filtered; + } catch { + return []; + } +} + +/** + * Normalize a raw tweet/like structure from the archive format + */ +function normalizeTweetLike( + item: any, + _source: "twitter:tweet" | "twitter:like", +): { + id: string; + text: string; + created_at: string; + parent_id?: string | null; + raw: any; +} | null { + const t = item?.tweet ?? item?.like ?? item; + if (!t) return null; + const id = t.id || t.tweetId; + if (!id) return null; + const text = t.text || t.fullText || t.full_text || ""; + const created_at = t.created_at || t.createdAt || ""; + const parent_id = t.in_reply_to_status_id || t.inReplyTo || null; + return { id, text, created_at, parent_id, raw: t }; +} + +/** + * Ingest a Twitter/X archive into normalized ContentItem records + */ +export async function ingestTwitter( + rootPath: string, + logger: (l: Level, m: string) => void, +): Promise<ContentItem[]> { + const manifestPath = path.join(rootPath, "data", "manifest.js"); + const manifest: Manifest = await readJsonFromJs(manifestPath); + const types = manifest.dataTypes ?? {}; + const out: ContentItem[] = []; + + const selected: Array<"tweets" | "like"> = Object.keys(types).filter( + (t) => t === "tweets" || t === "like", + ) as any; + + for (const dataType of selected) { + const info = types[dataType]; + const files = info?.files ?? 
[]; + if (!files.length) continue; + + logger("info", `Processing ${files.length} files for ${dataType}`); + + for (const f of files) { + const filePath = path.join(rootPath, f.fileName); + const raw = await fs.readFile(filePath, "utf8"); + const cleaned = cleanJsonString(raw); + const data = parseLooseArray(cleaned); + if (!Array.isArray(data) || data.length === 0) continue; + + for (const item of data) { + const norm = normalizeTweetLike( + item, + dataType === "tweets" ? "twitter:tweet" : "twitter:like", + ); + if (!norm) continue; + + const mediaFiles = await getMediaFiles(rootPath, norm.id); + const media: MediaAttachment[] = mediaFiles.map((fn) => ({ + id: `${norm.id}_${fn.replace(/\.\w+$/, "")}`, + contentType: mediaTypeFromExt(fn), + absPath: path.join(rootPath, "data", "tweets_media", fn), + metadata: { + parent: norm.id, + media_info: norm.raw?.extended_entities?.media ?? [], + }, + })); + + out.push({ + id: norm.id, + text: norm.text, + createdAt: norm.created_at ? toIso(norm.created_at) : new Date().toISOString(), + parentId: norm.parent_id ?? null, + source: dataType === "tweets" ? "twitter:tweet" : "twitter:like", + raw: norm.raw, + media, + }); + } + } + } + + logger("info", `Total normalized items: ${out.length}`); + return out; +} diff --git a/src/transforms/core.ts b/src/transforms/core.ts new file mode 100644 index 0000000..97a66fc --- /dev/null +++ b/src/transforms/core.ts @@ -0,0 +1,136 @@ +import { ContentItem, Thread, ChatMessage, Role, isRetweet } from "../core/types"; + +/** + * Replace shortened URLs with expanded, strip t.co links, mentions, hashtags, + * collapse whitespace and trim. + */ +export function cleanText( + text: string, + entities?: { urls?: Array<{ url: string; expanded_url?: string }> }, +): string { + let t = text ?? ""; + if (entities?.urls) { + for (const u of entities.urls) { + if (u.url && u.expanded_url) t = t.split(u.url).join(u.expanded_url); + } + } + t = t.replace(/https:\/\/t\.co\/\w+/g, ""); + t = t.replace(/@\w+/g, ""); + t = t.replace(/#\w+/g, ""); + t = t.replace(/\s+/g, " "); + return t.trim(); +} + +export type FilterOptions = { + since?: string; + until?: string; + minLength: number; + excludeRt: boolean; + onlyThreads: boolean; // reserved for higher-level logic; not applied here + withMedia: boolean; +}; + +/** + * Apply stateless filters to a list of ContentItem. + * Note: onlyThreads is intentionally ignored here; thread selection happens after grouping. + */ +export function applyFilters(items: ContentItem[], opts: FilterOptions): ContentItem[] { + const sinceTime = opts.since ? new Date(opts.since).getTime() : -Infinity; + const untilTime = opts.until ? new Date(opts.until).getTime() : Infinity; + + return items.filter((it) => { + const t = new Date(it.createdAt).getTime(); + if (!(t >= sinceTime && t <= untilTime)) return false; + if (opts.excludeRt && isRetweet(it.text)) return false; + if (opts.minLength > 0 && (it.text?.trim().length ?? 0) < opts.minLength) return false; + if (opts.withMedia && !(it.media && it.media.length > 0)) return false; + return true; + }); +} + +/** + * Build a fast lookup map of items by id. + */ +export function indexById(items: ContentItem[]): Record<string, ContentItem> { + const m: Record<string, ContentItem> = {}; + for (const it of items) { + if (it.id) m[it.id] = it; + } + return m; +} + +/** + * Group items into tweet threads and mixed-source conversations. + * Threads are chains where all items come from "twitter:tweet". + * Conversations are chains which include other sources or likes. 
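+ *
+ * Example with hypothetical ids: if C replies to B and B replies to A, and all
+ * three are tweets, the chain becomes one Thread { id: "A", items: [A, B, C] }
+ * (oldest first); if any link is a like, the whole chain goes to conversations.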
+ */ +export function groupThreadsAndConversations(all: Record<string, ContentItem>): { + threads: Thread[]; + conversations: ContentItem[][]; +} { + const processed = new Set<string>(); + const threads: Thread[] = []; + const conversations: ContentItem[][] = []; + + const items = Object.values(all); + for (const item of items) { + if (processed.has(item.id)) continue; + + const chain: ContentItem[] = [item]; + let current = item; + while (current.parentId && all[current.parentId]) { + const parent = all[current.parentId]; + chain.push(parent); + current = parent; + if (processed.has(current.id)) break; + } + for (const c of chain) processed.add(c.id); + + const allTweets = chain.every((c) => c.source === "twitter:tweet"); + if (allTweets) { + const ordered = chain.slice().reverse(); // oldest → newest + threads.push({ id: ordered[0].id, items: ordered }); + } else { + conversations.push(chain.slice().reverse()); // oldest → newest + } + } + + return { threads, conversations }; +} + +/** + * Convert a conversation (ordered list of ContentItems) into ChatMessages: + * - Simple heuristic for roles (maintains prior behavior). + * - Clean text using cleanText(). + * - Merge consecutive messages from the same role. + * - Trim trailing user messages to end on assistant if possible. + */ +export function messagesFromConversation(items: ContentItem[]): ChatMessage[] { + const msgs: ChatMessage[] = []; + let currentRole: Role | undefined; + let currentContent: string[] = []; + + function flush() { + if (!currentRole) return; + const content = currentContent.join("\n\n").trim(); + if (content) msgs.push({ role: currentRole, content }); + currentContent = []; + } + + for (const it of items) { + const role: Role = it.raw && "full_text" in (it.raw as any) ? "assistant" : "user"; + const cleaned = cleanText(it.text, (it.raw as any)?.entities); + if (!cleaned) continue; + + if (role !== currentRole && currentRole) flush(); + currentRole = role; + currentContent.push(cleaned); + } + flush(); + + // Trim to last assistant message if present + for (let i = msgs.length - 1; i >= 0; i--) { + if (msgs[i].role === "assistant") return msgs.slice(0, i + 1); + } + return []; +} diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..f67c65c --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,30 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "ES2022", + "moduleResolution": "Bundler", + "lib": ["ES2022"], + "rootDir": "src", + "outDir": "dist", + "baseUrl": ".", + "paths": { + "@core/*": ["src/core/*"], + "@sources/*": ["src/sources/*"], + "@transforms/*": ["src/transforms/*"], + "@outputs/*": ["src/outputs/*"], + "@cli/*": ["src/cli/*"] + }, + "types": ["node"], + "resolveJsonModule": true, + "esModuleInterop": true, + "allowSyntheticDefaultImports": true, + "verbatimModuleSyntax": false, + "moduleDetection": "force", + "skipLibCheck": true, + "sourceMap": true, + "declaration": false, + "noEmitOnError": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist", "tests/**"] +} From 502761ba160060ffd4c43468317af844742b3e71 Mon Sep 17 00:00:00 2001 From: deepfates <deepfates@gmail.com> Date: Sat, 18 Oct 2025 16:21:58 -0700 Subject: [PATCH 2/4] feat(api): expose library entrypoint for composing sources/transforms/outputs and plugging in custom adapters --- package.json | 5 ++- src/index.ts | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 src/index.ts diff --git a/package.json b/package.json 
index 9670686..77a127b 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,10 @@ "license": "MIT", "author": "", "type": "module", - "main": "dist/splice.js", + "main": "dist/index.js", + "exports": { + ".": "./dist/index.js" + }, "bin": { "splice": "dist/splice.js" }, diff --git a/src/index.ts b/src/index.ts new file mode 100644 index 0000000..132eed5 --- /dev/null +++ b/src/index.ts @@ -0,0 +1,94 @@ +/** + * Public library API + * + * This module re-exports the core types, utilities, source adapters, transforms, + * and output writers so consumers can: + * - Import only the pieces they need + * - Plug in proprietary/custom sources or outputs without forking + * - Compose their own pipelines programmatically + * + * Backwards-compatibility + * - The CLI uses the same functions exported here. + * - These extension interfaces (SourceAdapter/Transform/OutputAdapter) are intended + * to remain stable; changes will be signaled with semver. + */ + +// Re-export shared types, args, logger, and utilities +export * from "./core/types"; + +// Re-export built-in Source(s) +export * from "./sources/twitter"; + +// Re-export built-in Transforms +export * from "./transforms/core"; + +// Re-export built-in Outputs +export { + writeMarkdown, + writeOAI, + writeNormalizedJSONL, + writeShareGPT, + writeStatsJSON, +} from "./outputs/writers"; + +/* ------------------------------- Extensions ------------------------------- */ + +import type { Level, ContentItem, Thread } from "./core/types"; + +/** + * Logger signature used across the pipeline + */ +export type Logger = (level: Level, message: string) => void; + +/** + * A pluggable input adapter for new sources (e.g., Bluesky, ChatGPT exports, custom archives). + * Implementors normalize their inputs to ContentItem[] and preserve rich metadata in `raw`. + */ +export interface SourceAdapter { + kind: string; // e.g., "twitter", "bluesky", "chatgpt", "custom:foo" + detect(pathOrUri: string): Promise<boolean>; + ingest(pathOrUri: string, log: Logger): Promise<ContentItem[]>; +} + +/** + * Generic transform step. Keep these pure where possible so results + * can be cached by input hash + config hash when we add checkpointing. + */ +export interface Transform<Input, Output> { + name: string; // e.g., "filter", "group:threads", "score:length" + apply( + input: Input, + config: Record<string, unknown>, + ): Promise<{ output: Output; stats?: Record<string, number> }>; +} + +/** + * Context provided to OutputAdapters. + */ +export interface OutputAdapterContext { + outDir: string; + dryRun?: boolean; + logger: Logger; +} + +/** + * Arguments passed to OutputAdapters. + * Consumers can pass only what their adapter needs; undefined fields can be ignored. + */ +export interface OutputWriteArgs { + items?: ContentItem[]; + threads?: Thread[]; + conversations?: ContentItem[][]; + systemMessage?: string; + // room for future fields (e.g., selection metadata, annotations, etc.) + [key: string]: unknown; +} + +/** + * A pluggable output adapter for new render targets. + * Examples: proprietary JSONL, HTML site, custom training data, etc. 
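+ *
+ * Minimal sketch of a custom adapter (a hypothetical word-count target, not
+ * one of the built-ins):
+ *
+ *   const wordcount: OutputAdapter = {
+ *     name: "custom:wordcount",
+ *     async write(args, ctx) {
+ *       // Count whitespace-separated words across all normalized items
+ *       const total = (args.items ?? []).reduce(
+ *         (sum, it) => sum + it.text.split(/\s+/).length,
+ *         0,
+ *       );
+ *       ctx.logger("info", `counted ${total} words`);
+ *     },
+ *   };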
+ */ +export interface OutputAdapter { + name: string; // e.g., "markdown", "oai", "custom:myformat" + write(args: OutputWriteArgs, ctx: OutputAdapterContext): Promise<void>; +} From 76155b9101b8d6391a89bad5525fd81f498b58e9 Mon Sep 17 00:00:00 2001 From: deepfates <deepfates@gmail.com> Date: Sat, 18 Oct 2025 16:32:28 -0700 Subject: [PATCH 3/4] docs: update README for modular architecture and library API; export public API for adapters --- README.md | 109 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 81 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 202ebf2..3bd0ead 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,48 @@ # 🫚 splice -Convert your Twitter/X archive into normalized threads and export to Markdown, OAI JSONL, JSON (normalized items), and ShareGPT. Single-file TypeScript CLI, human-first, composable. - -- Human-friendly CLI (clig.dev principles) -- Outputs: - - Markdown per-thread, plus non-thread tweets grouped by date - - OAI-compatible JSONL for language model fine-tuning/evaluation - - Normalized items JSONL (one item per line, for debugging/inspection) +Convert social/chat archives into normalized threads and export to Markdown, OAI JSONL, JSON (normalized items), and ShareGPT. Modular TypeScript CLI and library with extensible sources → transforms → outputs. + +- Idiomatic CLI (clig.dev principles) +- Modular architecture: + - sources: Twitter/X today; Bluesky, ChatGPT, etc. next + - transforms: filtering, grouping into threads/conversations, text cleaning + - outputs: Markdown, OAI JSONL, JSONL (normalized items), ShareGPT +- Library API to compose your own pipeline or plug in proprietary adapters - Copies referenced media into an images/ folder -- Works directly with your Twitter archive (manifest.js + data files) +- JSONL artifacts for easy inspection and future checkpointing ## Why -A minimalist CLI to turn your Twitter archive into: -- Markdown you can read or publish -- OAI JSONL you can train on -- A normalized JSONL dump for inspection +Turn your archives into: +- Readable Markdown +- OAI-compatible JSONL for training/eval +- A normalized JSONL dump for inspection and reuse Today it imports Twitter/X. The plan is to splice in other archives (Bluesky, ChatGPT, Reddit, Glowfic, Hugging Face, …) and let you pick the strands you want to weave into a training set. -This library started life as a python script. This is a TypeScript rewrite where development will continue. Versions of this codebase were used in the development of [deeperfates.com](https://deeperfates.com), [keltham.lol](https://keltham.lol), [youaretheassistantnow.com](https://youaretheassistantnow.com) and other personality clones. +This library started life as a Python script. This is a TypeScript rewrite where development will continue. It has powered projects like [deeperfates.com](https://deeperfates.com), [keltham.lol](https://keltham.lol), and [youaretheassistantnow.com](https://youaretheassistantnow.com). 
More context: https://deepfates.com/convert-your-twitter-archive-into-training-data -## Quick start +## Quick start (CLI) Requirements: -- Node.js 18+ (tested with recent Node LTS and current) -- For direct execution: `tsx` (installed automatically when using `npx`) +- Node.js 18+ (tested with recent LTS) +- For direct execution: `tsx` (installed automatically with `npx`) Run with tsx (no build needed): npx tsx splice.ts --source /path/to/twitter-archive --out ./out +Run the published CLI (after install): + + npx splice --source /path/to/twitter-archive --out ./out + Build then run with Node: npm install npm run build - node dist/splice.js --source /path/to/twitter-archive --out ./out + node dist/cli/splice.js --source /path/to/twitter-archive --out ./out Dev/watch mode: @@ -50,18 +55,30 @@ Help (equivalent to `--help`): splice — convert a Twitter archive to Markdown, OAI JSONL, and/or JSON Usage: - splice --source <path> --out <dir> [--format markdown oai json] [--system-message <text>] [--dry-run] [--log-level <level>] + splice --source <path> --out <dir> [--format markdown oai json sharegpt] [--system-message <text>] + [--since <iso>] [--until <iso>] [--min-length <n>] [--exclude-rt] [--only-threads] [--with-media] + [--dry-run] [--stats-json] [--log-level <level>] [--json-stdout] [--quiet|-q] [--verbose] [--version|-V] Options: --source <path> Path to the Twitter archive directory --out <dir> Output directory - --format <fmt...> One or more formats: markdown, oai, json (default: markdown oai) + --format <fmt...> One or more formats: markdown, oai, json, sharegpt (default: markdown oai) --system-message <text> System message for OAI JSONL (default: "You have been uploaded to the internet") Alias: --system + --since <iso> Include items on/after this ISO date + --until <iso> Include items on/before this ISO date + --min-length <n> Minimum text length + --exclude-rt Exclude retweets (RT ...) + --only-threads Output threads only + --with-media Only include items that have media --dry-run, -n Plan only; don’t write files + --stats-json Write a stats.json summary --log-level <level> debug|info|warn|error (default: info) - --help, -h Show help + --json-stdout Emit normalized items JSONL to stdout; logs to stderr + --quiet, -q Errors only + --verbose Debug logging --version, -V Show version + --help, -h Show help Environment: SPLICE_SYSTEM_MESSAGE Alternative way to set the OAI system message @@ -73,7 +90,7 @@ Exit codes: - 2: invalid arguments or source detection failed Stdout/Stderr: -- Primary logs and progress go to stderr (so you can pipe stdout safely when we add stdout formats) +- Primary logs go to stderr (so you can safely pipe stdout) - Data files are written to the output directory ## Examples @@ -120,7 +137,7 @@ Dry run with debug logs (no files written): ## Input assumptions -This first version supports the standard Twitter archive ZIP extracted to a directory that contains: +Supports the standard Twitter/X archive ZIP extracted to a directory that contains: - `data/manifest.js` - `data/tweets_media/` (optional, for media assets) @@ -143,9 +160,44 @@ On a successful run, you’ll see: - `out/stats.json` — summary (counts, threads/conversations, date range) Notes: -- Filenames for threads are derived from the first five words of the top post (sanitized). +- Thread filenames are derived from the top post’s first words (sanitized). - The OAI JSONL file includes a top-level “system” message (configurable). 
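+
+With the default system message, each line of `conversations_oai.jsonl` looks roughly like this (abridged):
+
+```json
+{"messages":[{"role":"system","content":"You have been uploaded to the internet"},{"role":"user","content":"..."},{"role":"assistant","content":"..."}]}
+```
+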
+## Architecture (for contributors)
+
+- src/core — shared types, arg parsing, logger, utilities
+- src/sources — input adapters (twitter.ts)
+- src/transforms — filters, grouping, conversation mapping
+- src/outputs — writers for markdown/oai/json/sharegpt/stats
+- src/cli — CLI entrypoint wiring sources → transforms → outputs
+
+The code is structured so you can add new sources, transforms, or outputs without touching unrelated parts.
+
+## Library usage
+
+You can import and compose pieces in your own app:
+
+```ts
+import {
+  ingestTwitter,
+  applyFilters,
+  indexById,
+  groupThreadsAndConversations,
+  writeOAI,
+} from "@deepfates/splice";
+
+const items = await ingestTwitter("/path/to/archive", (l, m) => console.error(`[${l}] ${m}`));
+const filtered = applyFilters(items, { minLength: 20, excludeRt: true, onlyThreads: false, withMedia: false });
+const all = indexById(filtered);
+const { threads, conversations } = groupThreadsAndConversations(all);
+await writeOAI(threads, conversations, "./out", "You have been uploaded to the internet", (l, m) => console.error(`[${l}] ${m}`), false);
+```
+
+Pluggable adapters (build proprietary ones privately and upstream later if you want):
+
+- SourceAdapter: `detect(pathOrUri)`, `ingest(pathOrUri, logger) → ContentItem[]`
+- OutputAdapter: `write(args, ctx)` where args may include `items`, `threads`, `conversations`, `systemMessage`, and ctx provides `outDir`, `dryRun`, and `logger`
+
 ## Development
 
 Install deps:
@@ -160,17 +212,17 @@ Watch mode:
 
     npm run dev -- --source /path/to/twitter-archive --out ./out
 
-Build (emits `dist/splice.js` and sets up the `splice` bin):
+Build (emits `dist/cli/splice.js` and sets up the `splice` bin; library API at `dist/index.js`):
 
     npm run build
 
 Run the built CLI:
 
-    node dist/splice.js --source /path/to/twitter-archive --out ./out
+    node dist/cli/splice.js --source /path/to/twitter-archive --out ./out
 
 ## Testing
 
-Run the full test suite (includes an integration test that verifies Markdown, OAI JSONL with system message, and normalized JSONL outputs):
+Run the full test suite (includes integration tests for Markdown, OAI JSONL with system message, media copying, and normalized JSONL):
 
     npm test
 
@@ -181,9 +233,10 @@ Watch tests:
 
 ## Roadmap (short)
 
 - More inputs: Bluesky, Reddit, ChatGPT, Glowfic, HF datasets
-- More outputs: ShareGPT, SQLite/Parquet/CSV
+- Checkpointing and resumable pipelines (JSONL-based manifests)
+- More outputs: ShareGPT enhancements, SQLite/Parquet/CSV
 - Better selection: persona/character filters, time ranges
-- Note tweets and improved role attribution
+- Improved role attribution and metadata preservation
 
 ## License

From 5a8a53da7aa5401fa8ab21878969a6f54f42cf9b Mon Sep 17 00:00:00 2001
From: deepfates <deepfates@gmail.com>
Date: Sat, 18 Oct 2025 16:52:30 -0700
Subject: [PATCH 4/4] chore: address review nits (bin path, format messages, role inference helper)

---
 package.json           |  8 ++++----
 src/cli/splice.ts      | 14 ++++++--------
 src/transforms/core.ts | 27 ++++++++++++++++++++++-----
 3 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/package.json b/package.json
index 77a127b..06fb080 100644
--- a/package.json
+++ b/package.json
@@ -10,7 +10,7 @@
     ".": "./dist/index.js"
   },
   "bin": {
-    "splice": "dist/splice.js"
+    "splice": "dist/cli/splice.js"
   },
   "files": [
     "dist/**",
@@ -19,9 +19,9 @@
     "CHANGELOG.md"
   ],
   "scripts": {
-    "start": "tsx splice.ts",
-    "dev": "tsx watch splice.ts",
-    "build": "tsc --target ES2022 --module ES2022 --moduleResolution Bundler --outDir dist splice.ts",
+    "start": "tsx 
src/cli/splice.ts", + "dev": "tsx watch src/cli/splice.ts", + "build": "tsc -p tsconfig.json", "prepare": "npm run build", "test": "vitest run --reporter verbose", "test:watch": "vitest", diff --git a/src/cli/splice.ts b/src/cli/splice.ts index f77e023..55b972c 100644 --- a/src/cli/splice.ts +++ b/src/cli/splice.ts @@ -10,12 +10,7 @@ import * as path from "node:path"; import * as fs from "node:fs/promises"; import { fileURLToPath } from "node:url"; -import { - CLIOptions, - parseArgs, - makeLogger, - usage, -} from "../core/types"; +import { CLIOptions, parseArgs, makeLogger, usage } from "../core/types"; import { detectTwitterArchive, ingestTwitter } from "../sources/twitter"; import { @@ -193,7 +188,10 @@ async function main() { const validFormats = requested.filter((f) => allowedFormats.has(f)); const invalidFormats = requested.filter((f) => !allowedFormats.has(f)); for (const bad of invalidFormats) { - logger("warn", `Unknown format "${bad}". Supported: markdown, oai, json`); + logger( + "warn", + `Unknown format "${bad}". Supported: markdown, oai, json, sharegpt`, + ); } const jsonStdout = argv.includes("--json-stdout"); @@ -209,7 +207,7 @@ async function main() { if (formatSpecified && validFormats.length === 0) { logger( "error", - "No valid formats requested. Supported: markdown, oai, json", + "No valid formats requested. Supported: markdown, oai, json, sharegpt", ); process.stderr.write(usage() + "\n"); process.exit(2); diff --git a/src/transforms/core.ts b/src/transforms/core.ts index 97a66fc..e101b0e 100644 --- a/src/transforms/core.ts +++ b/src/transforms/core.ts @@ -1,4 +1,10 @@ -import { ContentItem, Thread, ChatMessage, Role, isRetweet } from "../core/types"; +import { + ContentItem, + Thread, + ChatMessage, + Role, + isRetweet, +} from "../core/types"; /** * Replace shortened URLs with expanded, strip t.co links, mentions, hashtags, @@ -34,7 +40,10 @@ export type FilterOptions = { * Apply stateless filters to a list of ContentItem. * Note: onlyThreads is intentionally ignored here; thread selection happens after grouping. */ -export function applyFilters(items: ContentItem[], opts: FilterOptions): ContentItem[] { +export function applyFilters( + items: ContentItem[], + opts: FilterOptions, +): ContentItem[] { const sinceTime = opts.since ? new Date(opts.since).getTime() : -Infinity; const untilTime = opts.until ? new Date(opts.until).getTime() : Infinity; @@ -42,7 +51,8 @@ export function applyFilters(items: ContentItem[], opts: FilterOptions): Content const t = new Date(it.createdAt).getTime(); if (!(t >= sinceTime && t <= untilTime)) return false; if (opts.excludeRt && isRetweet(it.text)) return false; - if (opts.minLength > 0 && (it.text?.trim().length ?? 0) < opts.minLength) return false; + if (opts.minLength > 0 && (it.text?.trim().length ?? 0) < opts.minLength) + return false; if (opts.withMedia && !(it.media && it.media.length > 0)) return false; return true; }); @@ -64,7 +74,9 @@ export function indexById(items: ContentItem[]): Record<string, ContentItem> { * Threads are chains where all items come from "twitter:tweet". * Conversations are chains which include other sources or likes. */ -export function groupThreadsAndConversations(all: Record<string, ContentItem>): { +export function groupThreadsAndConversations( + all: Record<string, ContentItem>, +): { threads: Thread[]; conversations: ContentItem[][]; } { @@ -105,6 +117,11 @@ export function groupThreadsAndConversations(all: Record<string, ContentItem>): * - Merge consecutive messages from the same role. 
* - Trim trailing user messages to end on assistant if possible. */ +export function inferRole(it: ContentItem): Role { + // Heuristic: tweets that look like assistant outputs (e.g., have full_text) are "assistant"; others are "user" + return it.raw && "full_text" in (it.raw as any) ? "assistant" : "user"; +} + export function messagesFromConversation(items: ContentItem[]): ChatMessage[] { const msgs: ChatMessage[] = []; let currentRole: Role | undefined; @@ -118,7 +135,7 @@ export function messagesFromConversation(items: ContentItem[]): ChatMessage[] { } for (const it of items) { - const role: Role = it.raw && "full_text" in (it.raw as any) ? "assistant" : "user"; + const role: Role = inferRole(it); const cleaned = cleanText(it.text, (it.raw as any)?.entities); if (!cleaned) continue;