From 69f10d03f970e58d5c683f83a826751c7365a3ef Mon Sep 17 00:00:00 2001 From: deepfates Date: Sat, 18 Oct 2025 15:54:25 -0700 Subject: [PATCH 1/4] refactor: modular pipeline layout (sources/transforms/outputs/core/cli); preserve CLI behavior; add tsconfig; update build/bin --- package-lock.json | 189 ++++++++++++++++++++++++- src/cli/splice.ts | 268 +++++++++++++++++++++++++++++++++++ src/core/types.ts | 312 +++++++++++++++++++++++++++++++++++++++++ src/outputs/writers.ts | 279 ++++++++++++++++++++++++++++++++++++ src/sources/twitter.ts | 140 ++++++++++++++++++ src/transforms/core.ts | 136 ++++++++++++++++++ tsconfig.json | 30 ++++ 7 files changed, 1347 insertions(+), 7 deletions(-) create mode 100644 src/cli/splice.ts create mode 100644 src/core/types.ts create mode 100644 src/outputs/writers.ts create mode 100644 src/sources/twitter.ts create mode 100644 src/transforms/core.ts create mode 100644 tsconfig.json diff --git a/package-lock.json b/package-lock.json index 2139d6a..e9b1e8b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,15 +1,18 @@ { - "name": "splice", - "version": "0.1.0", + "name": "@deepfates/splice", + "version": "0.1.1", "lockfileVersion": 3, "requires": true, "packages": { "": { - "name": "splice", - "version": "0.1.0", + "name": "@deepfates/splice", + "version": "0.1.1", "license": "MIT", + "dependencies": { + "cosmiconfig": "^9.0.0" + }, "bin": { - "splice": "dist/splice.js" + "splice": "dist/cli/splice.js" }, "devDependencies": { "@types/node": "^22.7.4", @@ -22,6 +25,29 @@ "node": ">=18" } }, + "node_modules/@babel/code-frame": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.27.1.tgz", + "integrity": "sha512-cjQ7ZlQ0Mv3b47hABuTevyTuYN4i+loJKGeV9flcCgIK37cCXRh+L1bd3iBHlynerhQ7BhCkn2BPbQUL+rGqFg==", + "license": "MIT", + "dependencies": { + "@babel/helper-validator-identifier": "^7.27.1", + "js-tokens": "^4.0.0", + "picocolors": "^1.1.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.27.1.tgz", + "integrity": "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, "node_modules/@esbuild/aix-ppc64": { "version": "0.25.10", "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.25.10.tgz", @@ -929,6 +955,12 @@ "url": "https://opencollective.com/vitest" } }, + "node_modules/argparse": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", + "license": "Python-2.0" + }, "node_modules/assertion-error": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-2.0.1.tgz", @@ -949,6 +981,15 @@ "node": ">=8" } }, + "node_modules/callsites": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", + "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/chai": { "version": "5.3.3", "resolved": "https://registry.npmjs.org/chai/-/chai-5.3.3.tgz", @@ -976,6 +1017,32 @@ "node": ">= 16" } }, + "node_modules/cosmiconfig": { + "version": "9.0.0", + 
"resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-9.0.0.tgz", + "integrity": "sha512-itvL5h8RETACmOTFc4UfIyB2RfEHi71Ax6E/PivVxq9NseKbOWpeyHEOIbmAw1rs8Ak0VursQNww7lf7YtUwzg==", + "license": "MIT", + "dependencies": { + "env-paths": "^2.2.1", + "import-fresh": "^3.3.0", + "js-yaml": "^4.1.0", + "parse-json": "^5.2.0" + }, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/d-fischer" + }, + "peerDependencies": { + "typescript": ">=4.9.5" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, "node_modules/cross-spawn": { "version": "7.0.6", "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", @@ -1019,6 +1086,24 @@ "node": ">=6" } }, + "node_modules/env-paths": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/env-paths/-/env-paths-2.2.1.tgz", + "integrity": "sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/error-ex": { + "version": "1.3.4", + "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.4.tgz", + "integrity": "sha512-sqQamAnR14VgCr1A618A3sGrygcpK+HEbenA/HiEAkkUwcZIIB/tgWqHFxWgOyDh4nB4JCRimh79dR5Ywc9MDQ==", + "license": "MIT", + "dependencies": { + "is-arrayish": "^0.2.1" + } + }, "node_modules/es-module-lexer": { "version": "1.7.0", "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.7.0.tgz", @@ -1186,6 +1271,28 @@ "node": ">=18.18.0" } }, + "node_modules/import-fresh": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", + "integrity": "sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ==", + "license": "MIT", + "dependencies": { + "parent-module": "^1.0.0", + "resolve-from": "^4.0.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-arrayish": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.2.1.tgz", + "integrity": "sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==", + "license": "MIT" + }, "node_modules/is-plain-obj": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-4.1.0.tgz", @@ -1232,6 +1339,36 @@ "dev": true, "license": "ISC" }, + "node_modules/js-tokens": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", + "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", + "license": "MIT" + }, + "node_modules/js-yaml": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz", + "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==", + "license": "MIT", + "dependencies": { + "argparse": "^2.0.1" + }, + "bin": { + "js-yaml": "bin/js-yaml.js" + } + }, + "node_modules/json-parse-even-better-errors": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz", + "integrity": "sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==", + "license": "MIT" + }, + "node_modules/lines-and-columns": { + "version": "1.2.4", + "resolved": 
"https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", + "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", + "license": "MIT" + }, "node_modules/loupe": { "version": "3.2.1", "resolved": "https://registry.npmjs.org/loupe/-/loupe-3.2.1.tgz", @@ -1305,6 +1442,36 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/parent-module": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", + "integrity": "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==", + "license": "MIT", + "dependencies": { + "callsites": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/parse-json": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-5.2.0.tgz", + "integrity": "sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==", + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.0.0", + "error-ex": "^1.3.1", + "json-parse-even-better-errors": "^2.3.0", + "lines-and-columns": "^1.1.6" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/parse-ms": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/parse-ms/-/parse-ms-4.0.0.tgz", @@ -1349,7 +1516,6 @@ "version": "1.1.1", "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", - "dev": true, "license": "ISC" }, "node_modules/postcss": { @@ -1397,6 +1563,15 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/resolve-from": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", + "integrity": "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==", + "license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/resolve-pkg-maps": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz", @@ -1597,7 +1772,7 @@ "version": "5.9.3", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", - "dev": true, + "devOptional": true, "license": "Apache-2.0", "bin": { "tsc": "bin/tsc", diff --git a/src/cli/splice.ts b/src/cli/splice.ts new file mode 100644 index 0000000..f77e023 --- /dev/null +++ b/src/cli/splice.ts @@ -0,0 +1,268 @@ +#!/usr/bin/env -S tsx +/** + * splice — CLI entrypoint + * Wires sources → transforms → outputs using modular architecture. + * + * Maintains existing flags/behavior from the original monolithic script. 
+ */ + +import * as path from "node:path"; +import * as fs from "node:fs/promises"; +import { fileURLToPath } from "node:url"; + +import { + CLIOptions, + parseArgs, + makeLogger, + usage, +} from "../core/types"; + +import { detectTwitterArchive, ingestTwitter } from "../sources/twitter"; +import { + applyFilters, + indexById, + groupThreadsAndConversations, +} from "../transforms/core"; +import { + writeMarkdown, + writeOAI, + writeNormalizedJSONL, + writeShareGPT, + writeStatsJSON, +} from "../outputs/writers"; + +/* -------------------------------- version -------------------------------- */ + +async function getVersion(): Promise { + try { + const thisFile = fileURLToPath(import.meta.url); + const dir = path.dirname(thisFile); + // src/cli/splice.ts -> ../../package.json + // dist/cli/splice.js -> ../../package.json + const pkgPath = path.join(dir, "..", "..", "package.json"); + const raw = await fs.readFile(pkgPath, "utf8"); + const pkg = JSON.parse(raw); + return typeof pkg.version === "string" ? pkg.version : "0.0.0"; + } catch { + return "0.0.0"; + } +} + +/* ---------------------------------- main ---------------------------------- */ + +async function main() { + const opts: CLIOptions = parseArgs(process.argv); + if (opts.help) { + process.stderr.write(usage() + "\n"); + process.exit(0); + } + if (opts.version) { + const v = await getVersion(); + process.stdout.write(`splice ${v}\n`); + process.exit(0); + } + + // Allow quick verbosity shorthands unless an explicit --log-level was provided + { + const argv = process.argv.slice(2); + const hasExplicitLogLevel = argv.includes("--log-level"); + const wantsQuiet = argv.includes("--quiet") || argv.includes("-q"); + const wantsVerbose = argv.includes("--verbose"); + if (!hasExplicitLogLevel) { + if (wantsQuiet) (opts as any).logLevel = "error"; + else if (wantsVerbose) (opts as any).logLevel = "debug"; + } + } + + const logger = makeLogger(opts.logLevel); + + // Warn on unknown flags with a simple suggestion + { + const argv = process.argv.slice(2); + const known = new Set([ + "--help", + "-h", + "--version", + "-V", + "--source", + "--archive-path", + "--out", + "--output-dir", + "--format", + "--formats", + "--output-formats", + "--system-message", + "--system", + "--dry-run", + "-n", + "--log-level", + "--quiet", + "-q", + "--verbose", + "--json-stdout", + "--since", + "--until", + "--min-length", + "--exclude-rt", + "--only-threads", + "--with-media", + "--stats-json", + "--", + ]); + const unknown = argv.filter( + (a) => a.startsWith("-") && !known.has(a) && a !== "-" && a !== "--", + ); + const candidates = Array.from(known).filter((f) => f.startsWith("--")); + const suggest = (flag: string): string | null => { + let best: string | null = null; + let score = -1; + for (const c of candidates) { + // simple common prefix score + let s = 0; + const L = Math.min(flag.length, c.length); + for (let i = 0; i < L; i++) { + if (flag[i] === c[i]) s++; + else break; + } + if (s > score) { + score = s; + best = c; + } + } + return score >= 2 ? best : null; + }; + for (const uf of unknown) { + const hint = suggest(uf); + if (hint) logger("warn", `Unknown flag ${uf}. Did you mean ${hint}?`); + else + logger( + "warn", + `Unknown flag ${uf}. 
Run with --help to see supported flags.`, + ); + } + } + + if (!opts.source || !opts.out) { + process.stderr.write(usage() + "\n"); + process.exit(2); + } + + const source = path.resolve(opts.source); + const outDir = path.resolve(opts.out); + + const detected = await detectTwitterArchive(source); + if (!detected) { + logger( + "error", + `Could not detect a Twitter archive at ${source} (missing data/manifest.js)`, + ); + process.exit(2); + } + + try { + logger("info", `Ingesting from ${source}`); + const items = await ingestTwitter(source, logger); + + const filtered = applyFilters(items, { + since: opts.since, + until: opts.until, + minLength: opts.minLength, + excludeRt: opts.excludeRt, + onlyThreads: opts.onlyThreads, + withMedia: opts.withMedia, + }); + + const all = indexById(filtered); + let { threads, conversations } = groupThreadsAndConversations(all); + if (opts.onlyThreads) { + conversations = []; + } + logger( + "info", + `Threads: ${threads.length}, Conversations: ${conversations.length}`, + ); + + // Validate formats and support --json-stdout for piping normalized items + const argv = process.argv.slice(2); + const formatSpecified = + argv.includes("--format") || + argv.includes("--formats") || + argv.includes("--output-formats"); + const allowedFormats = new Set(["markdown", "oai", "json", "sharegpt"]); + const requested = opts.format || []; + const validFormats = requested.filter((f) => allowedFormats.has(f)); + const invalidFormats = requested.filter((f) => !allowedFormats.has(f)); + for (const bad of invalidFormats) { + logger("warn", `Unknown format "${bad}". Supported: markdown, oai, json`); + } + const jsonStdout = argv.includes("--json-stdout"); + + if (jsonStdout) { + // Print normalized items as JSONL to stdout; logs remain on stderr + for (const it of items) { + process.stdout.write(JSON.stringify(it) + "\n"); + } + logger("info", "Wrote normalized items to stdout"); + process.exit(0); + } + + if (formatSpecified && validFormats.length === 0) { + logger( + "error", + "No valid formats requested. Supported: markdown, oai, json", + ); + process.stderr.write(usage() + "\n"); + process.exit(2); + } + + if (validFormats.includes("markdown")) { + await writeMarkdown( + threads, + opts.onlyThreads ? [] : filtered, + outDir, + logger, + opts.dryRun, + ); + } + if (validFormats.includes("json")) { + await writeNormalizedJSONL(items, outDir, logger, opts.dryRun); + } + const systemMessage = + process.env.SPLICE_SYSTEM_MESSAGE ?? opts.systemMessage; + logger("debug", `System message: ${systemMessage}`); + if (validFormats.includes("oai")) { + await writeOAI( + threads, + conversations, + outDir, + systemMessage, + logger, + opts.dryRun, + ); + } + if (validFormats.includes("sharegpt")) { + await writeShareGPT(threads, conversations, outDir, logger, opts.dryRun); + } + if (opts.statsJson) { + await writeStatsJSON( + filtered, + threads, + conversations, + outDir, + logger, + opts.dryRun, + ); + } + + logger("info", opts.dryRun ? "Dry run complete." : "Done."); + process.exit(0); + } catch (e) { + logger("error", (e as Error).message); + process.exit(1); + } +} + +main().catch((err) => { + process.stderr.write(`[error] ${(err as Error).message}\n`); + process.exit(1); +}); diff --git a/src/core/types.ts b/src/core/types.ts new file mode 100644 index 0000000..647bf73 --- /dev/null +++ b/src/core/types.ts @@ -0,0 +1,312 @@ +/** + * Core types, CLI args, logger, and shared utilities. + * Extracted to support a modular pipeline architecture. 
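+ *
+ * An illustrative ContentItem as produced by a source adapter (values are
+ * invented; see the interface below for the full shape):
+ *
+ *   {
+ *     id: "1234567890",
+ *     text: "hello world",
+ *     createdAt: "2024-01-01T00:00:00.000Z",
+ *     parentId: null,
+ *     source: "twitter:tweet",
+ *   }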
+ */ + +import * as fs from "node:fs/promises"; +import * as path from "node:path"; + +/* --------------------------------- Types --------------------------------- */ + +export type Level = "debug" | "info" | "warn" | "error"; + +export type SourceId = "twitter:tweet" | "twitter:like" | string; + +export interface MediaAttachment { + id: string; + contentType: "photo" | "video" | "unknown"; + absPath?: string; // local absolute path if available + url?: string; // remote URL if available + metadata?: Record; +} + +export interface ContentItem { + id: string; + text: string; + createdAt: string; // ISO-8601 + parentId?: string | null; + source: SourceId; + raw?: Record; + media?: MediaAttachment[]; + annotations?: Record; +} + +export interface Thread { + id: string; + items: ContentItem[]; // ordered oldest → newest +} + +export type Role = "assistant" | "user"; + +export interface ChatMessage { + role: Role; + content: string; +} + +/* -------------------------------- Logger --------------------------------- */ + +export function makeLogger(level: Level): (lvl: Level, msg: string) => void { + const order: Level[] = ["debug", "info", "warn", "error"]; + const minIdx = order.indexOf(level); + return (lvl: Level, msg: string) => { + if (order.indexOf(lvl) >= minIdx) { + process.stderr.write(`[${lvl}] ${msg}\n`); + } + }; +} + +/* --------------------------------- Args ---------------------------------- */ + +export type CLIOptions = { + source?: string; + out?: string; + format: string[]; // e.g. ['markdown','oai'] + systemMessage: string; + dryRun: boolean; + logLevel: Level; + help: boolean; + version: boolean; + // filters + since?: string; + until?: string; + minLength: number; + excludeRt: boolean; + onlyThreads: boolean; + withMedia: boolean; + // outputs + statsJson: boolean; +}; + +export const DEFAULT_SYSTEM_MESSAGE = + "You have been uploaded to the internet"; + +export function parseArgs(argv: string[]): CLIOptions { + const opts: CLIOptions = { + format: ["markdown", "oai"], + systemMessage: DEFAULT_SYSTEM_MESSAGE, + dryRun: false, + logLevel: "info", + help: false, + version: false, + since: undefined, + until: undefined, + minLength: 0, + excludeRt: false, + onlyThreads: false, + withMedia: false, + statsJson: false, + }; + + const args = argv.slice(2); + let systemExplicit = false; + for (let i = 0; i < args.length; i++) { + const a = args[i]; + if (a === "--help" || a === "-h") { + opts.help = true; + } else if (a === "--version" || a === "-V") { + opts.version = true; + } else if (a === "--source" || a === "--archive-path") { + opts.source = args[++i]; + } else if (a === "--out" || a === "--output-dir") { + opts.out = args[++i]; + } else if ( + a === "--format" || + a === "--formats" || + a === "--output-formats" + ) { + const next = args[++i]; + if (!next) continue; + // allow space or comma separated + const parts = next.split(",").filter(Boolean); + if (parts.length > 1) opts.format = parts; + else { + // collect following non-flag tokens too (space-separated list) + const list = [next]; + while (args[i + 1] && !args[i + 1].startsWith("-")) { + list.push(args[++i]); + } + opts.format = list; + } + } else if (a === "--system-message" || a === "--system") { + const val = args[++i]; + if (val) { + opts.systemMessage = val; + systemExplicit = true; + } + } else if (a === "--dry-run" || a === "-n") { + opts.dryRun = true; + } else if (a === "--log-level") { + const lvl = (args[++i] ?? 
"").toLowerCase(); + if ( + lvl === "debug" || + lvl === "info" || + lvl === "warn" || + lvl === "error" + ) { + opts.logLevel = lvl; + } + } else if (a === "--since") { + opts.since = args[++i]; + } else if (a === "--until") { + opts.until = args[++i]; + } else if (a === "--min-length") { + const v = parseInt(args[++i] ?? "", 10); + if (!Number.isNaN(v)) opts.minLength = v; + } else if (a === "--exclude-rt") { + opts.excludeRt = true; + } else if (a === "--only-threads") { + opts.onlyThreads = true; + } else if (a === "--with-media") { + opts.withMedia = true; + } else if (a === "--stats-json") { + opts.statsJson = true; + } else if (a === "--") { + break; + } else if (a.startsWith("-")) { + // unknown flag; ignore to keep simple (CLI warns elsewhere) + } else { + // positional? ignore for now + } + } + if (!systemExplicit && process.env.SPLICE_SYSTEM_MESSAGE) { + opts.systemMessage = process.env.SPLICE_SYSTEM_MESSAGE as string; + } + return opts; +} + +export function usage(): string { + return [ + "splice — convert a Twitter archive to Markdown, OAI JSONL, and/or JSON", + "", + "Usage:", + " splice --source --out [--format markdown oai json sharegpt] [--system-message ]", + " [--since ] [--until ] [--min-length ] [--exclude-rt] [--only-threads] [--with-media]", + " [--dry-run] [--stats-json] [--log-level ] [--json-stdout] [--quiet|-q] [--verbose] [--version|-V]", + "", + "Options:", + " --source Path to the Twitter archive directory", + " --out Output directory", + " --format One or more formats: markdown, oai, json, sharegpt (default: markdown oai)", + ' --system, --system-message System message for OAI JSONL (default: "You have been uploaded to the internet")', + " --since Include items on/after this ISO date", + " --until Include items on/before this ISO date", + " --min-length Minimum text length", + " --exclude-rt Exclude retweets (RT ...)", + " --only-threads Output threads only (ignore conversations/non-thread tweets)", + " --with-media Only include items that have media", + " --dry-run, -n Plan only; don’t write files", + " --stats-json Write a stats.json summary", + " --log-level debug|info|warn|error (default: info)", + " --json-stdout Emit normalized items JSONL to stdout (no files); logs to stderr", + " --quiet, -q Errors only", + " --verbose Debug logging", + " --version, -V Show version", + " --help, -h Show help", + "", + "Examples:", + " splice --source ./archive --out ./out --format markdown oai json", + ' splice --source ./archive --out ./out --format oai --system-message "You are helpful."', + " splice --source ./archive --out ./out --since 2024-01-01 --only-threads", + " splice --source ./archive --out ./out --json-stdout", + " splice --version", + "", + "Docs: https://github.com/deepfates/splice • Context: https://deepfates.com/convert-your-twitter-archive-into-training-data", + ].join("\n"); +} + +/* --------------------------------- Utils --------------------------------- */ + +export function cleanJsonString(js: string): string { + // remove window.* = prefix and trailing semicolon + return js + .trim() + .replace(/^window\.[^=]+=\s*/i, "") + .replace(/;?\s*$/, ""); +} + +export async function readJsonFromJs(filePath: string): Promise { + const raw = await fs.readFile(filePath, "utf8"); + const cleaned = cleanJsonString(raw); + try { + return JSON.parse(cleaned); + } catch { + // try __THAR_CONFIG fallback + const match = raw.match(/window\.__THAR_CONFIG\s*=\s*({[\s\S]*?})\s*;?/); + if (match) return JSON.parse(match[1]); + throw new Error(`Could not parse JSON 
from ${filePath}`); + } +} + +/** + * Accepts strict JSON arrays or loose JS array/object literals. + * Returns [] on failure. + */ +export function parseLooseArray(input: string): any[] { + // Try strict JSON first + try { + const parsed = JSON.parse(input); + return Array.isArray(parsed) ? parsed : []; + } catch { + // Fall through to loose JS evaluation + } + + // Attempt to evaluate as a JS array/object literal in a confined context. + // cleanJsonString should have removed any "window.* = " prefix so input should be an array expression. + try { + // eslint-disable-next-line no-new-func + const fn = new Function('"use strict"; return (' + input + ");"); + const result = fn(); + return Array.isArray(result) ? result : []; + } catch { + return []; + } +} + +export async function loadConfig(): Promise { + try { + const mod: any = await import("cosmiconfig"); + const explorer = mod.cosmiconfig("splice"); + const result = await explorer.search(); + return result?.config; + } catch { + return undefined; + } +} + +export function mediaTypeFromExt( + filename: string, +): "photo" | "video" | "unknown" { + const ext = path.extname(filename).toLowerCase(); + if (ext === ".mp4" || ext === ".mov") return "video"; + if (ext === ".jpg" || ext === ".jpeg" || ext === ".png" || ext === ".gif") + return "photo"; + return "unknown"; +} + +export function sanitizeFilename(name: string, maxLen = 50): string { + return ( + name + .replace(/[^\w\-_ ]/g, "") + .trim() + .replace(/\s+/g, "_") + .slice(0, maxLen) || "untitled" + ); +} + +export function toIso(d: string | Date): string { + const dt = typeof d === "string" ? new Date(d) : d; + return Number.isNaN(dt.getTime()) + ? new Date().toISOString() + : dt.toISOString(); +} + +export function isRetweet(text: string): boolean { + return /^RT\b/.test(text || ""); +} + +export function formatIsoDateOnly(iso: string): string { + const d = new Date(iso); + return isNaN(d.getTime()) + ? new Date().toISOString().slice(0, 10) + : d.toISOString().slice(0, 10); +} diff --git a/src/outputs/writers.ts b/src/outputs/writers.ts new file mode 100644 index 0000000..33358ad --- /dev/null +++ b/src/outputs/writers.ts @@ -0,0 +1,279 @@ +import * as fs from "node:fs/promises"; +import * as path from "node:path"; +import { + ContentItem, + Thread, + Level, + formatIsoDateOnly, + sanitizeFilename, + isRetweet, +} from "../core/types"; +import { cleanText, messagesFromConversation } from "../transforms/core"; + +/** + * Ensure a directory exists (mkdir -p). + */ +async function ensureDir(p: string) { + await fs.mkdir(p, { recursive: true }); +} + +/** + * Copy media attachments for a set of items into imagesDir, prefixing names with "_". + * If an attachment lacks absPath, it will be skipped with a warning. + */ +async function copyMedia( + items: ContentItem[], + imagesDir: string, + logger: (l: Level, m: string) => void, +) { + await ensureDir(imagesDir); + for (const it of items) { + for (const m of it.media ?? []) { + const base = m.absPath ? path.basename(m.absPath) : `${m.id}.bin`; + try { + if (!m.absPath) { + logger("warn", `No absPath for media ${m.id}; skipping copy`); + continue; + } + await fs.copyFile(m.absPath, path.join(imagesDir, `_${base}`)); + } catch (e) { + logger( + "warn", + `Failed to copy media ${m.absPath ?? 
m.id}: ${(e as Error).message}`, + ); + } + } + } +} + +/** + * Write Markdown outputs: + * - threads/.md with frontmatter, cleaned text, media links, and link to Twitter + * - tweets_by_date/<YYYY-MM-DD>.md for non-thread tweets (excluding RTs) + * - images/_<file> copied for referenced items + */ +export async function writeMarkdown( + threads: Thread[], + items: ContentItem[], + outDir: string, + logger: (l: Level, m: string) => void, + dryRun: boolean, +) { + const threadsDir = path.join(outDir, "threads"); + const byDateDir = path.join(outDir, "tweets_by_date"); + const imagesDir = path.join(outDir, "images"); + + if (!dryRun) { + await ensureDir(threadsDir); + await ensureDir(byDateDir); + await ensureDir(imagesDir); + } + + // Copy media for all thread items + non-thread tweets + const threadItems = threads.flatMap((t) => t.items); + const threadIds = new Set(threadItems.map((i) => i.id)); + const nonThreadTweets = items.filter( + (i) => + i.source === "twitter:tweet" && + !i.parentId && + !threadIds.has(i.id) && + !isRetweet(i.text), + ); + const copyPool = threadItems.concat(nonThreadTweets); + + logger("info", `Preparing media for ${copyPool.length} items`); + if (!dryRun) await copyMedia(copyPool, imagesDir, logger); + + // Save threads + logger("info", `Saving ${threads.length} threads`); + for (const thread of threads) { + const first = thread.items[0]; + const date = formatIsoDateOnly(first.createdAt); + const fm = `---\nDate: ${date}\n---\n`; + + const parts: string[] = []; + for (const t of thread.items) { + const mediaLinks = (t.media ?? []).map((m) => { + const base = m.absPath ? path.basename(m.absPath) : `${m.id}.bin`; + return `![${base}](../images/_${base})`; + }); + const cleaned = cleanText(t.text, (t.raw as any)?.entities); + parts.push(`${cleaned}\n\n${mediaLinks.join("\n")}`.trim()); + } + + const firstWords = thread.items[0].text.split(/\s+/).slice(0, 5).join(" "); + const name = sanitizeFilename(firstWords) || thread.id; + const filePath = path.join(threadsDir, `${name}.md`); + const topLink = `https://twitter.com/i/web/status/${first.id}`; + const body = `${fm}\n${parts.join("\n\n")}\n\n[View on Twitter](${topLink})`; + + if (dryRun) { + logger("info", `(dry-run) would write thread file: ${filePath}`); + } else { + await fs.writeFile(filePath, body, "utf8"); + } + } + + // Save non-thread tweets by date + const byDate: Record<string, ContentItem[]> = {}; + for (const t of nonThreadTweets) { + const d = formatIsoDateOnly(t.createdAt); + (byDate[d] ||= []).push(t); + } + + for (const [date, dayItems] of Object.entries(byDate)) { + dayItems.sort((a, b) => a.createdAt.localeCompare(b.createdAt)); + const content = dayItems + .map((t) => { + const dt = new Date(t.createdAt); + const time = isNaN(dt.getTime()) + ? "" + : dt.toLocaleTimeString("en-US", { + hour: "numeric", + minute: "2-digit", + }); + const images = (t.media ?? []) + .map((m) => { + const base = m.absPath ? path.basename(m.absPath) : `${m.id}.bin`; + return `![${base}](../images/_${base})`; + }) + .join(""); + const cleaned = cleanText(t.text, (t.raw as any)?.entities); + return `*${time}* \n${cleaned}${images}`; + }) + .join("\n\n---\n\n"); + + const filePath = path.join(byDateDir, `${date}.md`); + if (dryRun) { + logger("info", `(dry-run) would write daily file: ${filePath}`); + } else { + await fs.writeFile(filePath, content, "utf8"); + } + } +} + +/** + * Write conversations in OpenAI JSONL format. + * Note: Includes a system message at the top of each conversation. 
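+ *
+ * Each line is one JSON record, shaped roughly like:
+ *   {"messages":[{"role":"system","content":"..."},{"role":"user","content":"..."},{"role":"assistant","content":"..."}]}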
+ */ +export async function writeOAI( + threads: Thread[], + conversations: ContentItem[][], + outDir: string, + systemMessage: string, + logger: (l: Level, m: string) => void, + dryRun: boolean, +) { + const outPath = path.join(outDir, "conversations_oai.jsonl"); + if (dryRun) { + logger("info", `(dry-run) would write OAI JSONL: ${outPath}`); + return; + } + await ensureDir(path.dirname(outPath)); + const fh = await fs.open(outPath, "w"); + + const writeConv = async (items: ContentItem[]) => { + const msgs = messagesFromConversation(items); + if (!msgs.length) return; + const record = { + messages: [{ role: "system", content: systemMessage }, ...msgs], + }; + await fh.write(JSON.stringify(record) + "\n"); + }; + + for (const t of threads) await writeConv(t.items); + for (const c of conversations) await writeConv(c); + await fh.close(); + logger("info", `Wrote OAI JSONL to ${outPath}`); +} + +/** + * Write the normalized ContentItem stream as JSONL for downstream reuse. + */ +export async function writeNormalizedJSONL( + items: ContentItem[], + outDir: string, + logger: (l: Level, m: string) => void, + dryRun: boolean, +) { + const outPath = path.join(outDir, "normalized_items.jsonl"); + if (dryRun) { + logger("info", `(dry-run) would write normalized items JSONL: ${outPath}`); + return; + } + await ensureDir(path.dirname(outPath)); + const fh = await fs.open(outPath, "w"); + for (const it of items) { + await fh.write(JSON.stringify(it) + "\n"); + } + await fh.close(); + logger("info", `Wrote normalized items JSONL to ${outPath}`); +} + +/** + * Write ShareGPT JSON format from conversations derived from threads and mixed conversations. + */ +export async function writeShareGPT( + threads: Thread[], + conversations: ContentItem[][], + outDir: string, + logger: (l: Level, m: string) => void, + dryRun: boolean, +) { + const outPath = path.join(outDir, "sharegpt.json"); + if (dryRun) { + logger("info", `(dry-run) would write ShareGPT JSON: ${outPath}`); + return; + } + await ensureDir(path.dirname(outPath)); + const list: Array<{ conversations: Array<{ from: string; value: string }> }> = + []; + const addConv = async (items: ContentItem[]) => { + const msgs = messagesFromConversation(items); + if (!msgs.length) return; + list.push({ + conversations: msgs.map((m) => ({ + from: m.role === "user" ? "human" : "gpt", + value: m.content, + })), + }); + }; + for (const t of threads) await addConv(t.items); + for (const c of conversations) await addConv(c); + await fs.writeFile(outPath, JSON.stringify(list, null, 2), "utf8"); + logger("info", `Wrote ShareGPT JSON to ${outPath}`); +} + +/** + * Write a small stats.json summary about items, threads, conversations, and date range. + */ +export async function writeStatsJSON( + items: ContentItem[], + threads: Thread[], + conversations: ContentItem[][], + outDir: string, + logger: (l: Level, m: string) => void, + dryRun: boolean, +) { + const outPath = path.join(outDir, "stats.json"); + const dates = items + .map((i) => new Date(i.createdAt).toISOString()) + .filter(Boolean); + const start = dates.length ? dates.reduce((a, b) => (a < b ? a : b)) : null; + const end = dates.length ? dates.reduce((a, b) => (a > b ? 
a : b)) : null; + const stats = { + totalItems: items.length, + tweets: items.filter((i) => i.source === "twitter:tweet").length, + likes: items.filter((i) => i.source === "twitter:like").length, + threads: threads.length, + conversations: conversations.length, + dateRange: { start, end }, + }; + if (dryRun) { + logger("info", `(dry-run) would write stats JSON: ${outPath}`); + return; + } + await ensureDir(path.dirname(outPath)); + await fs.writeFile(outPath, JSON.stringify(stats, null, 2), "utf8"); + logger("info", `Wrote stats JSON to ${outPath}`); +} diff --git a/src/sources/twitter.ts b/src/sources/twitter.ts new file mode 100644 index 0000000..9ec5e12 --- /dev/null +++ b/src/sources/twitter.ts @@ -0,0 +1,140 @@ +import * as fs from "node:fs/promises"; +import * as path from "node:path"; +import { + Level, + ContentItem, + MediaAttachment, + readJsonFromJs, + parseLooseArray, + mediaTypeFromExt, + toIso, + cleanJsonString, +} from "../core/types"; + +/** + * Subset of the Twitter/X archive manifest schema + */ +type Manifest = { + dataTypes?: Record<string, { files?: Array<{ fileName: string }> }>; +}; + +/** + * Detect whether a directory looks like a Twitter/X archive by checking for data/manifest.js + */ +export async function detectTwitterArchive(rootPath: string): Promise<boolean> { + try { + const p = path.join(rootPath, "data", "manifest.js"); + await fs.stat(p); + return true; + } catch { + return false; + } +} + +/** + * Return media file basenames for a given tweet id. + * Filters out zero-byte files to avoid broken copies. + */ +async function getMediaFiles(root: string, id: string): Promise<string[]> { + const mediaDir = path.join(root, "data", "tweets_media"); + try { + const files = await fs.readdir(mediaDir); + const filtered: string[] = []; + for (const f of files) { + if (!f.startsWith(`${id}-`)) continue; + const stat = await fs.stat(path.join(mediaDir, f)); + if (stat.size > 0) filtered.push(f); + } + return filtered; + } catch { + return []; + } +} + +/** + * Normalize a raw tweet/like structure from the archive format + */ +function normalizeTweetLike( + item: any, + _source: "twitter:tweet" | "twitter:like", +): { + id: string; + text: string; + created_at: string; + parent_id?: string | null; + raw: any; +} | null { + const t = item?.tweet ?? item?.like ?? item; + if (!t) return null; + const id = t.id || t.tweetId; + if (!id) return null; + const text = t.text || t.fullText || t.full_text || ""; + const created_at = t.created_at || t.createdAt || ""; + const parent_id = t.in_reply_to_status_id || t.inReplyTo || null; + return { id, text, created_at, parent_id, raw: t }; +} + +/** + * Ingest a Twitter/X archive into normalized ContentItem records + */ +export async function ingestTwitter( + rootPath: string, + logger: (l: Level, m: string) => void, +): Promise<ContentItem[]> { + const manifestPath = path.join(rootPath, "data", "manifest.js"); + const manifest: Manifest = await readJsonFromJs(manifestPath); + const types = manifest.dataTypes ?? {}; + const out: ContentItem[] = []; + + const selected: Array<"tweets" | "like"> = Object.keys(types).filter( + (t) => t === "tweets" || t === "like", + ) as any; + + for (const dataType of selected) { + const info = types[dataType]; + const files = info?.files ?? 
[]; + if (!files.length) continue; + + logger("info", `Processing ${files.length} files for ${dataType}`); + + for (const f of files) { + const filePath = path.join(rootPath, f.fileName); + const raw = await fs.readFile(filePath, "utf8"); + const cleaned = cleanJsonString(raw); + const data = parseLooseArray(cleaned); + if (!Array.isArray(data) || data.length === 0) continue; + + for (const item of data) { + const norm = normalizeTweetLike( + item, + dataType === "tweets" ? "twitter:tweet" : "twitter:like", + ); + if (!norm) continue; + + const mediaFiles = await getMediaFiles(rootPath, norm.id); + const media: MediaAttachment[] = mediaFiles.map((fn) => ({ + id: `${norm.id}_${fn.replace(/\.\w+$/, "")}`, + contentType: mediaTypeFromExt(fn), + absPath: path.join(rootPath, "data", "tweets_media", fn), + metadata: { + parent: norm.id, + media_info: norm.raw?.extended_entities?.media ?? [], + }, + })); + + out.push({ + id: norm.id, + text: norm.text, + createdAt: norm.created_at ? toIso(norm.created_at) : new Date().toISOString(), + parentId: norm.parent_id ?? null, + source: dataType === "tweets" ? "twitter:tweet" : "twitter:like", + raw: norm.raw, + media, + }); + } + } + } + + logger("info", `Total normalized items: ${out.length}`); + return out; +} diff --git a/src/transforms/core.ts b/src/transforms/core.ts new file mode 100644 index 0000000..97a66fc --- /dev/null +++ b/src/transforms/core.ts @@ -0,0 +1,136 @@ +import { ContentItem, Thread, ChatMessage, Role, isRetweet } from "../core/types"; + +/** + * Replace shortened URLs with expanded, strip t.co links, mentions, hashtags, + * collapse whitespace and trim. + */ +export function cleanText( + text: string, + entities?: { urls?: Array<{ url: string; expanded_url?: string }> }, +): string { + let t = text ?? ""; + if (entities?.urls) { + for (const u of entities.urls) { + if (u.url && u.expanded_url) t = t.split(u.url).join(u.expanded_url); + } + } + t = t.replace(/https:\/\/t\.co\/\w+/g, ""); + t = t.replace(/@\w+/g, ""); + t = t.replace(/#\w+/g, ""); + t = t.replace(/\s+/g, " "); + return t.trim(); +} + +export type FilterOptions = { + since?: string; + until?: string; + minLength: number; + excludeRt: boolean; + onlyThreads: boolean; // reserved for higher-level logic; not applied here + withMedia: boolean; +}; + +/** + * Apply stateless filters to a list of ContentItem. + * Note: onlyThreads is intentionally ignored here; thread selection happens after grouping. + */ +export function applyFilters(items: ContentItem[], opts: FilterOptions): ContentItem[] { + const sinceTime = opts.since ? new Date(opts.since).getTime() : -Infinity; + const untilTime = opts.until ? new Date(opts.until).getTime() : Infinity; + + return items.filter((it) => { + const t = new Date(it.createdAt).getTime(); + if (!(t >= sinceTime && t <= untilTime)) return false; + if (opts.excludeRt && isRetweet(it.text)) return false; + if (opts.minLength > 0 && (it.text?.trim().length ?? 0) < opts.minLength) return false; + if (opts.withMedia && !(it.media && it.media.length > 0)) return false; + return true; + }); +} + +/** + * Build a fast lookup map of items by id. + */ +export function indexById(items: ContentItem[]): Record<string, ContentItem> { + const m: Record<string, ContentItem> = {}; + for (const it of items) { + if (it.id) m[it.id] = it; + } + return m; +} + +/** + * Group items into tweet threads and mixed-source conversations. + * Threads are chains where all items come from "twitter:tweet". + * Conversations are chains which include other sources or likes. 
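+ *
+ * Example with hypothetical ids: if C replies to B and B replies to A, and all
+ * three are tweets, the chain becomes one Thread { id: "A", items: [A, B, C] }
+ * (oldest first); if any link is a like, the whole chain goes to conversations.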
+ */ +export function groupThreadsAndConversations(all: Record<string, ContentItem>): { + threads: Thread[]; + conversations: ContentItem[][]; +} { + const processed = new Set<string>(); + const threads: Thread[] = []; + const conversations: ContentItem[][] = []; + + const items = Object.values(all); + for (const item of items) { + if (processed.has(item.id)) continue; + + const chain: ContentItem[] = [item]; + let current = item; + while (current.parentId && all[current.parentId]) { + const parent = all[current.parentId]; + chain.push(parent); + current = parent; + if (processed.has(current.id)) break; + } + for (const c of chain) processed.add(c.id); + + const allTweets = chain.every((c) => c.source === "twitter:tweet"); + if (allTweets) { + const ordered = chain.slice().reverse(); // oldest → newest + threads.push({ id: ordered[0].id, items: ordered }); + } else { + conversations.push(chain.slice().reverse()); // oldest → newest + } + } + + return { threads, conversations }; +} + +/** + * Convert a conversation (ordered list of ContentItems) into ChatMessages: + * - Simple heuristic for roles (maintains prior behavior). + * - Clean text using cleanText(). + * - Merge consecutive messages from the same role. + * - Trim trailing user messages to end on assistant if possible. + */ +export function messagesFromConversation(items: ContentItem[]): ChatMessage[] { + const msgs: ChatMessage[] = []; + let currentRole: Role | undefined; + let currentContent: string[] = []; + + function flush() { + if (!currentRole) return; + const content = currentContent.join("\n\n").trim(); + if (content) msgs.push({ role: currentRole, content }); + currentContent = []; + } + + for (const it of items) { + const role: Role = it.raw && "full_text" in (it.raw as any) ? "assistant" : "user"; + const cleaned = cleanText(it.text, (it.raw as any)?.entities); + if (!cleaned) continue; + + if (role !== currentRole && currentRole) flush(); + currentRole = role; + currentContent.push(cleaned); + } + flush(); + + // Trim to last assistant message if present + for (let i = msgs.length - 1; i >= 0; i--) { + if (msgs[i].role === "assistant") return msgs.slice(0, i + 1); + } + return []; +} diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..f67c65c --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,30 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "ES2022", + "moduleResolution": "Bundler", + "lib": ["ES2022"], + "rootDir": "src", + "outDir": "dist", + "baseUrl": ".", + "paths": { + "@core/*": ["src/core/*"], + "@sources/*": ["src/sources/*"], + "@transforms/*": ["src/transforms/*"], + "@outputs/*": ["src/outputs/*"], + "@cli/*": ["src/cli/*"] + }, + "types": ["node"], + "resolveJsonModule": true, + "esModuleInterop": true, + "allowSyntheticDefaultImports": true, + "verbatimModuleSyntax": false, + "moduleDetection": "force", + "skipLibCheck": true, + "sourceMap": true, + "declaration": false, + "noEmitOnError": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist", "tests/**"] +} From 502761ba160060ffd4c43468317af844742b3e71 Mon Sep 17 00:00:00 2001 From: deepfates <deepfates@gmail.com> Date: Sat, 18 Oct 2025 16:21:58 -0700 Subject: [PATCH 2/4] feat(api): expose library entrypoint for composing sources/transforms/outputs and plugging in custom adapters --- package.json | 5 ++- src/index.ts | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 src/index.ts diff --git a/package.json b/package.json 
index 9670686..77a127b 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,10 @@ "license": "MIT", "author": "", "type": "module", - "main": "dist/splice.js", + "main": "dist/index.js", + "exports": { + ".": "./dist/index.js" + }, "bin": { "splice": "dist/splice.js" }, diff --git a/src/index.ts b/src/index.ts new file mode 100644 index 0000000..132eed5 --- /dev/null +++ b/src/index.ts @@ -0,0 +1,94 @@ +/** + * Public library API + * + * This module re-exports the core types, utilities, source adapters, transforms, + * and output writers so consumers can: + * - Import only the pieces they need + * - Plug in proprietary/custom sources or outputs without forking + * - Compose their own pipelines programmatically + * + * Backwards-compatibility + * - The CLI uses the same functions exported here. + * - These extension interfaces (SourceAdapter/Transform/OutputAdapter) are intended + * to remain stable; changes will be signaled with semver. + */ + +// Re-export shared types, args, logger, and utilities +export * from "./core/types"; + +// Re-export built-in Source(s) +export * from "./sources/twitter"; + +// Re-export built-in Transforms +export * from "./transforms/core"; + +// Re-export built-in Outputs +export { + writeMarkdown, + writeOAI, + writeNormalizedJSONL, + writeShareGPT, + writeStatsJSON, +} from "./outputs/writers"; + +/* ------------------------------- Extensions ------------------------------- */ + +import type { Level, ContentItem, Thread } from "./core/types"; + +/** + * Logger signature used across the pipeline + */ +export type Logger = (level: Level, message: string) => void; + +/** + * A pluggable input adapter for new sources (e.g., Bluesky, ChatGPT exports, custom archives). + * Implementors normalize their inputs to ContentItem[] and preserve rich metadata in `raw`. + */ +export interface SourceAdapter { + kind: string; // e.g., "twitter", "bluesky", "chatgpt", "custom:foo" + detect(pathOrUri: string): Promise<boolean>; + ingest(pathOrUri: string, log: Logger): Promise<ContentItem[]>; +} + +/** + * Generic transform step. Keep these pure where possible so results + * can be cached by input hash + config hash when we add checkpointing. + */ +export interface Transform<Input, Output> { + name: string; // e.g., "filter", "group:threads", "score:length" + apply( + input: Input, + config: Record<string, unknown>, + ): Promise<{ output: Output; stats?: Record<string, number> }>; +} + +/** + * Context provided to OutputAdapters. + */ +export interface OutputAdapterContext { + outDir: string; + dryRun?: boolean; + logger: Logger; +} + +/** + * Arguments passed to OutputAdapters. + * Consumers can pass only what their adapter needs; undefined fields can be ignored. + */ +export interface OutputWriteArgs { + items?: ContentItem[]; + threads?: Thread[]; + conversations?: ContentItem[][]; + systemMessage?: string; + // room for future fields (e.g., selection metadata, annotations, etc.) + [key: string]: unknown; +} + +/** + * A pluggable output adapter for new render targets. + * Examples: proprietary JSONL, HTML site, custom training data, etc. 
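+ *
+ * Minimal sketch of a custom adapter (a hypothetical word-count target, not
+ * one of the built-ins):
+ *
+ *   const wordcount: OutputAdapter = {
+ *     name: "custom:wordcount",
+ *     async write(args, ctx) {
+ *       // Count whitespace-separated words across all normalized items
+ *       const total = (args.items ?? []).reduce(
+ *         (sum, it) => sum + it.text.split(/\s+/).length,
+ *         0,
+ *       );
+ *       ctx.logger("info", `counted ${total} words`);
+ *     },
+ *   };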
+ */ +export interface OutputAdapter { + name: string; // e.g., "markdown", "oai", "custom:myformat" + write(args: OutputWriteArgs, ctx: OutputAdapterContext): Promise<void>; +} From 76155b9101b8d6391a89bad5525fd81f498b58e9 Mon Sep 17 00:00:00 2001 From: deepfates <deepfates@gmail.com> Date: Sat, 18 Oct 2025 16:32:28 -0700 Subject: [PATCH 3/4] docs: update README for modular architecture and library API; export public API for adapters --- README.md | 109 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 81 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 202ebf2..3bd0ead 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,48 @@ # 🫚 splice -Convert your Twitter/X archive into normalized threads and export to Markdown, OAI JSONL, JSON (normalized items), and ShareGPT. Single-file TypeScript CLI, human-first, composable. - -- Human-friendly CLI (clig.dev principles) -- Outputs: - - Markdown per-thread, plus non-thread tweets grouped by date - - OAI-compatible JSONL for language model fine-tuning/evaluation - - Normalized items JSONL (one item per line, for debugging/inspection) +Convert social/chat archives into normalized threads and export to Markdown, OAI JSONL, JSON (normalized items), and ShareGPT. Modular TypeScript CLI and library with extensible sources → transforms → outputs. + +- Idiomatic CLI (clig.dev principles) +- Modular architecture: + - sources: Twitter/X today; Bluesky, ChatGPT, etc. next + - transforms: filtering, grouping into threads/conversations, text cleaning + - outputs: Markdown, OAI JSONL, JSONL (normalized items), ShareGPT +- Library API to compose your own pipeline or plug in proprietary adapters - Copies referenced media into an images/ folder -- Works directly with your Twitter archive (manifest.js + data files) +- JSONL artifacts for easy inspection and future checkpointing ## Why -A minimalist CLI to turn your Twitter archive into: -- Markdown you can read or publish -- OAI JSONL you can train on -- A normalized JSONL dump for inspection +Turn your archives into: +- Readable Markdown +- OAI-compatible JSONL for training/eval +- A normalized JSONL dump for inspection and reuse Today it imports Twitter/X. The plan is to splice in other archives (Bluesky, ChatGPT, Reddit, Glowfic, Hugging Face, …) and let you pick the strands you want to weave into a training set. -This library started life as a python script. This is a TypeScript rewrite where development will continue. Versions of this codebase were used in the development of [deeperfates.com](https://deeperfates.com), [keltham.lol](https://keltham.lol), [youaretheassistantnow.com](https://youaretheassistantnow.com) and other personality clones. +This library started life as a Python script. This is a TypeScript rewrite where development will continue. It has powered projects like [deeperfates.com](https://deeperfates.com), [keltham.lol](https://keltham.lol), and [youaretheassistantnow.com](https://youaretheassistantnow.com). 
More context: https://deepfates.com/convert-your-twitter-archive-into-training-data -## Quick start +## Quick start (CLI) Requirements: -- Node.js 18+ (tested with recent Node LTS and current) -- For direct execution: `tsx` (installed automatically when using `npx`) +- Node.js 18+ (tested with recent LTS) +- For direct execution: `tsx` (installed automatically with `npx`) Run with tsx (no build needed): npx tsx splice.ts --source /path/to/twitter-archive --out ./out +Run the published CLI (after install): + + npx splice --source /path/to/twitter-archive --out ./out + Build then run with Node: npm install npm run build - node dist/splice.js --source /path/to/twitter-archive --out ./out + node dist/cli/splice.js --source /path/to/twitter-archive --out ./out Dev/watch mode: @@ -50,18 +55,30 @@ Help (equivalent to `--help`): splice — convert a Twitter archive to Markdown, OAI JSONL, and/or JSON Usage: - splice --source <path> --out <dir> [--format markdown oai json] [--system-message <text>] [--dry-run] [--log-level <level>] + splice --source <path> --out <dir> [--format markdown oai json sharegpt] [--system-message <text>] + [--since <iso>] [--until <iso>] [--min-length <n>] [--exclude-rt] [--only-threads] [--with-media] + [--dry-run] [--stats-json] [--log-level <level>] [--json-stdout] [--quiet|-q] [--verbose] [--version|-V] Options: --source <path> Path to the Twitter archive directory --out <dir> Output directory - --format <fmt...> One or more formats: markdown, oai, json (default: markdown oai) + --format <fmt...> One or more formats: markdown, oai, json, sharegpt (default: markdown oai) --system-message <text> System message for OAI JSONL (default: "You have been uploaded to the internet") Alias: --system + --since <iso> Include items on/after this ISO date + --until <iso> Include items on/before this ISO date + --min-length <n> Minimum text length + --exclude-rt Exclude retweets (RT ...) + --only-threads Output threads only + --with-media Only include items that have media --dry-run, -n Plan only; don’t write files + --stats-json Write a stats.json summary --log-level <level> debug|info|warn|error (default: info) - --help, -h Show help + --json-stdout Emit normalized items JSONL to stdout; logs to stderr + --quiet, -q Errors only + --verbose Debug logging --version, -V Show version + --help, -h Show help Environment: SPLICE_SYSTEM_MESSAGE Alternative way to set the OAI system message @@ -73,7 +90,7 @@ Exit codes: - 2: invalid arguments or source detection failed Stdout/Stderr: -- Primary logs and progress go to stderr (so you can pipe stdout safely when we add stdout formats) +- Primary logs go to stderr (so you can safely pipe stdout) - Data files are written to the output directory ## Examples @@ -120,7 +137,7 @@ Dry run with debug logs (no files written): ## Input assumptions -This first version supports the standard Twitter archive ZIP extracted to a directory that contains: +Supports the standard Twitter/X archive ZIP extracted to a directory that contains: - `data/manifest.js` - `data/tweets_media/` (optional, for media assets) @@ -143,9 +160,44 @@ On a successful run, you’ll see: - `out/stats.json` — summary (counts, threads/conversations, date range) Notes: -- Filenames for threads are derived from the first five words of the top post (sanitized). +- Thread filenames are derived from the top post’s first words (sanitized). - The OAI JSONL file includes a top-level “system” message (configurable). 
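+
+With the default system message, each line of `conversations_oai.jsonl` looks roughly like this (abridged):
+
+```json
+{"messages":[{"role":"system","content":"You have been uploaded to the internet"},{"role":"user","content":"..."},{"role":"assistant","content":"..."}]}
+```
+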
+## Architecture (for contributors)
+
+- src/core — shared types, arg parsing, logger, utilities
+- src/sources — input adapters (twitter.ts)
+- src/transforms — filters, grouping, conversation mapping
+- src/outputs — writers for markdown/oai/json/sharegpt/stats
+- src/cli — CLI entrypoint wiring sources → transforms → outputs
+
+The code is structured so you can add new sources, transforms, or outputs without touching unrelated parts.
+
+## Library usage
+
+You can import and compose pieces in your own app:
+
+```ts
+import {
+  ingestTwitter,
+  applyFilters,
+  indexById,
+  groupThreadsAndConversations,
+  writeOAI,
+} from "@deepfates/splice";
+
+const items = await ingestTwitter("/path/to/archive", (l, m) => console.error(`[${l}] ${m}`));
+const filtered = applyFilters(items, { minLength: 20, excludeRt: true, onlyThreads: false, withMedia: false });
+const all = indexById(filtered);
+const { threads, conversations } = groupThreadsAndConversations(all);
+await writeOAI(threads, conversations, "./out", "You have been uploaded to the internet", (l, m) => console.error(`[${l}] ${m}`), false);
+```
+
+Pluggable adapters (build proprietary ones privately and upstream later if you want):
+
+- SourceAdapter: `detect(pathOrUri)`, `ingest(pathOrUri, logger) → ContentItem[]`
+- OutputAdapter: `write(args, ctx)` where args may include `items`, `threads`, `conversations`, `systemMessage`, and ctx provides `outDir`, `dryRun`, and `logger`
+
 ## Development
 
 Install deps:
@@ -160,17 +212,17 @@ Watch mode:
 
     npm run dev -- --source /path/to/twitter-archive --out ./out
 
-Build (emits `dist/splice.js` and sets up the `splice` bin):
+Build (emits `dist/cli/splice.js` and sets up the `splice` bin; library API at `dist/index.js`):
 
     npm run build
 
 Run the built CLI:
 
-    node dist/splice.js --source /path/to/twitter-archive --out ./out
+    node dist/cli/splice.js --source /path/to/twitter-archive --out ./out
 
 ## Testing
 
-Run the full test suite (includes an integration test that verifies Markdown, OAI JSONL with system message, and normalized JSONL outputs):
+Run the full test suite (includes integration tests for Markdown, OAI JSONL with system message, media copying, and normalized JSONL):
 
     npm test
 
@@ -181,9 +233,10 @@ Watch tests:
 
 ## Roadmap (short)
 
 - More inputs: Bluesky, Reddit, ChatGPT, Glowfic, HF datasets
-- More outputs: ShareGPT, SQLite/Parquet/CSV
+- Checkpointing and resumable pipelines (JSONL-based manifests)
+- More outputs: ShareGPT enhancements, SQLite/Parquet/CSV
 - Better selection: persona/character filters, time ranges
-- Note tweets and improved role attribution
+- Improved role attribution and metadata preservation
 
 ## License

From 5a8a53da7aa5401fa8ab21878969a6f54f42cf9b Mon Sep 17 00:00:00 2001
From: deepfates <deepfates@gmail.com>
Date: Sat, 18 Oct 2025 16:52:30 -0700
Subject: [PATCH 4/4] chore: address review nits (bin path, format messages, role inference helper)

---
 package.json           |  8 ++++----
 src/cli/splice.ts      | 14 ++++++--------
 src/transforms/core.ts | 27 ++++++++++++++++++++++-----
 3 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/package.json b/package.json
index 77a127b..06fb080 100644
--- a/package.json
+++ b/package.json
@@ -10,7 +10,7 @@
     ".": "./dist/index.js"
   },
   "bin": {
-    "splice": "dist/splice.js"
+    "splice": "dist/cli/splice.js"
   },
   "files": [
     "dist/**",
@@ -19,9 +19,9 @@
     "CHANGELOG.md"
   ],
   "scripts": {
-    "start": "tsx splice.ts",
-    "dev": "tsx watch splice.ts",
-    "build": "tsc --target ES2022 --module ES2022 --moduleResolution Bundler --outDir dist splice.ts",
+    "start": "tsx 
src/cli/splice.ts", + "dev": "tsx watch src/cli/splice.ts", + "build": "tsc -p tsconfig.json", "prepare": "npm run build", "test": "vitest run --reporter verbose", "test:watch": "vitest", diff --git a/src/cli/splice.ts b/src/cli/splice.ts index f77e023..55b972c 100644 --- a/src/cli/splice.ts +++ b/src/cli/splice.ts @@ -10,12 +10,7 @@ import * as path from "node:path"; import * as fs from "node:fs/promises"; import { fileURLToPath } from "node:url"; -import { - CLIOptions, - parseArgs, - makeLogger, - usage, -} from "../core/types"; +import { CLIOptions, parseArgs, makeLogger, usage } from "../core/types"; import { detectTwitterArchive, ingestTwitter } from "../sources/twitter"; import { @@ -193,7 +188,10 @@ async function main() { const validFormats = requested.filter((f) => allowedFormats.has(f)); const invalidFormats = requested.filter((f) => !allowedFormats.has(f)); for (const bad of invalidFormats) { - logger("warn", `Unknown format "${bad}". Supported: markdown, oai, json`); + logger( + "warn", + `Unknown format "${bad}". Supported: markdown, oai, json, sharegpt`, + ); } const jsonStdout = argv.includes("--json-stdout"); @@ -209,7 +207,7 @@ async function main() { if (formatSpecified && validFormats.length === 0) { logger( "error", - "No valid formats requested. Supported: markdown, oai, json", + "No valid formats requested. Supported: markdown, oai, json, sharegpt", ); process.stderr.write(usage() + "\n"); process.exit(2); diff --git a/src/transforms/core.ts b/src/transforms/core.ts index 97a66fc..e101b0e 100644 --- a/src/transforms/core.ts +++ b/src/transforms/core.ts @@ -1,4 +1,10 @@ -import { ContentItem, Thread, ChatMessage, Role, isRetweet } from "../core/types"; +import { + ContentItem, + Thread, + ChatMessage, + Role, + isRetweet, +} from "../core/types"; /** * Replace shortened URLs with expanded, strip t.co links, mentions, hashtags, @@ -34,7 +40,10 @@ export type FilterOptions = { * Apply stateless filters to a list of ContentItem. * Note: onlyThreads is intentionally ignored here; thread selection happens after grouping. */ -export function applyFilters(items: ContentItem[], opts: FilterOptions): ContentItem[] { +export function applyFilters( + items: ContentItem[], + opts: FilterOptions, +): ContentItem[] { const sinceTime = opts.since ? new Date(opts.since).getTime() : -Infinity; const untilTime = opts.until ? new Date(opts.until).getTime() : Infinity; @@ -42,7 +51,8 @@ export function applyFilters(items: ContentItem[], opts: FilterOptions): Content const t = new Date(it.createdAt).getTime(); if (!(t >= sinceTime && t <= untilTime)) return false; if (opts.excludeRt && isRetweet(it.text)) return false; - if (opts.minLength > 0 && (it.text?.trim().length ?? 0) < opts.minLength) return false; + if (opts.minLength > 0 && (it.text?.trim().length ?? 0) < opts.minLength) + return false; if (opts.withMedia && !(it.media && it.media.length > 0)) return false; return true; }); @@ -64,7 +74,9 @@ export function indexById(items: ContentItem[]): Record<string, ContentItem> { * Threads are chains where all items come from "twitter:tweet". * Conversations are chains which include other sources or likes. */ -export function groupThreadsAndConversations(all: Record<string, ContentItem>): { +export function groupThreadsAndConversations( + all: Record<string, ContentItem>, +): { threads: Thread[]; conversations: ContentItem[][]; } { @@ -105,6 +117,11 @@ export function groupThreadsAndConversations(all: Record<string, ContentItem>): * - Merge consecutive messages from the same role. 
* - Trim trailing user messages to end on assistant if possible. */ +export function inferRole(it: ContentItem): Role { + // Heuristic: tweets that look like assistant outputs (e.g., have full_text) are "assistant"; others are "user" + return it.raw && "full_text" in (it.raw as any) ? "assistant" : "user"; +} + export function messagesFromConversation(items: ContentItem[]): ChatMessage[] { const msgs: ChatMessage[] = []; let currentRole: Role | undefined; @@ -118,7 +135,7 @@ export function messagesFromConversation(items: ContentItem[]): ChatMessage[] { } for (const it of items) { - const role: Role = it.raw && "full_text" in (it.raw as any) ? "assistant" : "user"; + const role: Role = inferRole(it); const cleaned = cleanText(it.text, (it.raw as any)?.entities); if (!cleaned) continue;