diff --git a/.gitignore b/.gitignore index 08c323a..ce97fdd 100644 --- a/.gitignore +++ b/.gitignore @@ -51,4 +51,5 @@ tmp/ temp/ # Test outputs -out/ +out*/ +!src/outputs/ diff --git a/README.md b/README.md index 5bd5231..2d95a7f 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,30 @@ Turn your archives into: - OAI-compatible JSONL for training/eval - A normalized JSONL dump for inspection and reuse -Today it imports Twitter/X. The plan is to splice in other archives (Bluesky, ChatGPT, Reddit, Glowfic, Hugging Face, …) and let you pick the strands you want to weave into a training set. +Today it imports Twitter/X and Glowfic. The plan is to splice in other archives (Bluesky, ChatGPT, Reddit, Hugging Face, …) and let you pick the strands you want to weave into a training set. + +Glowfic input (threads, sections, boards) +- You can now target Glowfic threads, board sections, or entire boards by URL (examples: https://glowfic.com/posts/5506, https://glowfic.com/board_sections/703, https://glowfic.com/boards/215). +- Under the hood we use the glowfic-dl library to fetch and parse content consistently (classic view). +- Programmatic usage in splice: + - Import helpers from the library: `detectGlowficUri`, `fetchGlowficThreads`, `fetchGlowficThreadsMany`, `normalizeGlowficThread`, `normalizeGlowficThreadsToItems`, `conversationsFromGlowficUrl`, `conversationsFromGlowficUrls`, and `GlowficSourceAdapter`. + - Provide one or more Glowfic URLs and an assistant selector to build conversations where a specific character speaks as the assistant and all others are the user. + - Assistant selection options: + - Exact display name string (matches `character_display_name`, case-insensitive) + - Exact handle string (matches `character_handle`, case-insensitive) + - Regex match on display name, handle, or author + - Predicate function `(post) => boolean` + - Typical flow: + 1) Choose your URL(s): thread/section/board. 
+ 2) Choose the assistant character (e.g., display name or handle). + 3) Call `conversationsFromGlowficUrl(url, assistant)` (or `conversationsFromGlowficUrls` for several URLs) to get arrays of `{ role: "assistant" | "user", content }`. + 4) Feed those messages into your exporter or fine-tuning writer. +- Notes: + - Markdown: Content is normalized to Markdown by default for training-friendly text; relative links/images are made absolute. + - If you want generic items instead of conversations, use `GlowficSourceAdapter.ingest(url, logger)`, which returns normalized `ContentItem[]`. + - To mirror the Doctor Who “script” style, select the relevant character as the assistant and enable consecutive-message merging; trailing user-only tails are trimmed by default so conversations end on an assistant reply. +- Install dependency: + - Add `glowfic-dl` to your project (Node 20.18.1+ recommended: `glowfic-dl` itself declares Node 18+, but its current `cheerio`/`undici` dependencies require Node 20.18.1 or newer): `npm i glowfic-dl` This library started life as a Python script. This is a TypeScript rewrite where development will continue. It has powered projects like [deeperfates.com](https://deeperfates.com), [keltham.lol](https://keltham.lol), and [youaretheassistantnow.com](https://youaretheassistantnow.com). 
@@ -232,7 +255,7 @@ Watch tests: ## Roadmap (short) -- More inputs: Bluesky, Reddit, ChatGPT, Glowfic, HF datasets +- More inputs: Bluesky, Reddit, ChatGPT, HF datasets (Glowfic done) - Checkpointing and resumable pipelines (JSONL-based manifests) - More outputs: ShareGPT enhancements, SQLite/Parquet/CSV - Better selection: persona/character filters, time ranges diff --git a/package-lock.json b/package-lock.json index e9b1e8b..9cee7e2 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,8 @@ "version": "0.1.1", "license": "MIT", "dependencies": { - "cosmiconfig": "^9.0.0" + "cosmiconfig": "^9.0.0", + "glowfic-dl": "^0.2.1" }, "bin": { "splice": "dist/cli/splice.js" @@ -497,6 +498,12 @@ "dev": true, "license": "MIT" }, + "node_modules/@mixmark-io/domino": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@mixmark-io/domino/-/domino-2.2.0.tgz", + "integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==", + "license": "BSD-2-Clause" + }, "node_modules/@rollup/rollup-android-arm-eabi": { "version": "4.52.4", "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.52.4.tgz", @@ -842,6 +849,12 @@ "undici-types": "~6.21.0" } }, + "node_modules/@types/turndown": { + "version": "5.0.6", + "resolved": "https://registry.npmjs.org/@types/turndown/-/turndown-5.0.6.tgz", + "integrity": "sha512-ru00MoyeeouE5BX4gRL+6m/BsDfbRayOskWqUvh7CLGW+UXxHQItqALa38kKnOiZPqJrtzJUgAC2+F0rL1S4Pg==", + "license": "MIT" + }, "node_modules/@vitest/expect": { "version": "2.1.9", "resolved": "https://registry.npmjs.org/@vitest/expect/-/expect-2.1.9.tgz", @@ -955,6 +968,33 @@ "url": "https://opencollective.com/vitest" } }, + "node_modules/ansi-regex": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.2.2.tgz", + "integrity": "sha512-Bq3SmSpyFHaWjPk8If9yc6svM8c56dB5BAtW4Qbw5jHTwwXXcTLoRMkpDJp6VL0XzlWaCHTXrkFURMYmD0sLqg==", + "license": 
"MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-regex?sponsor=1" + } + }, + "node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "license": "MIT", + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, "node_modules/argparse": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", @@ -971,6 +1011,12 @@ "node": ">=12" } }, + "node_modules/boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==", + "license": "ISC" + }, "node_modules/cac": { "version": "6.7.14", "resolved": "https://registry.npmjs.org/cac/-/cac-6.7.14.tgz", @@ -1007,6 +1053,18 @@ "node": ">=18" } }, + "node_modules/chalk": { + "version": "5.6.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-5.6.2.tgz", + "integrity": "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA==", + "license": "MIT", + "engines": { + "node": "^12.17.0 || ^14.13 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, "node_modules/check-error": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/check-error/-/check-error-2.1.1.tgz", @@ -1017,6 +1075,142 @@ "node": ">= 16" } }, + "node_modules/cheerio": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.1.2.tgz", + "integrity": "sha512-IkxPpb5rS/d1IiLbHMgfPuS0FgiWTtFIm/Nj+2woXDLTZ7fOT2eqzgYbdMlLweqlHbsZjxEChoVK+7iph7jyQg==", + "license": "MIT", + "dependencies": { + "cheerio-select": 
"^2.1.0", + "dom-serializer": "^2.0.0", + "domhandler": "^5.0.3", + "domutils": "^3.2.2", + "encoding-sniffer": "^0.2.1", + "htmlparser2": "^10.0.0", + "parse5": "^7.3.0", + "parse5-htmlparser2-tree-adapter": "^7.1.0", + "parse5-parser-stream": "^7.1.2", + "undici": "^7.12.0", + "whatwg-mimetype": "^4.0.0" + }, + "engines": { + "node": ">=20.18.1" + }, + "funding": { + "url": "https://github.com/cheeriojs/cheerio?sponsor=1" + } + }, + "node_modules/cheerio-select": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz", + "integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0", + "css-select": "^5.1.0", + "css-what": "^6.1.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/cli-cursor": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-5.0.0.tgz", + "integrity": "sha512-aCj4O5wKyszjMmDT4tZj93kxyydN/K5zPWSCe6/0AV/AA1pqe5ZBIw0a2ZfPQV7lL5/yb5HsUreJ6UFAF1tEQw==", + "license": "MIT", + "dependencies": { + "restore-cursor": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/cli-spinners": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/cli-spinners/-/cli-spinners-3.3.0.tgz", + "integrity": "sha512-/+40ljC3ONVnYIttjMWrlL51nItDAbBrq2upN8BPyvGU/2n5Oxw3tbNwORCaNuNqLJnxGqOfjUuhsv7l5Q4IsQ==", + "license": "MIT", + "engines": { + "node": ">=18.20" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/cliui": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz", + "integrity": 
"sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==", + "license": "ISC", + "dependencies": { + "string-width": "^4.2.0", + "strip-ansi": "^6.0.1", + "wrap-ansi": "^7.0.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/cliui/node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/cliui/node_modules/string-width": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "license": "MIT", + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/cliui/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "license": "MIT", + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": 
"sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "license": "MIT" + }, "node_modules/cosmiconfig": { "version": "9.0.0", "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-9.0.0.tgz", @@ -1058,6 +1252,43 @@ "node": ">= 8" } }, + "node_modules/css-select": { + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.2.2.tgz", + "integrity": "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css-what": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.2.2.tgz", + "integrity": "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==", + "license": "BSD-2-Clause", + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/data-uri-to-buffer": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-4.0.1.tgz", + "integrity": "sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A==", + "license": "MIT", + "engines": { + "node": ">= 12" + } + }, "node_modules/debug": { "version": "4.4.3", "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", @@ -1086,6 +1317,92 @@ "node": ">=6" } }, + "node_modules/dom-serializer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", + "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.2", + 
"entities": "^4.2.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/domelementtype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", + "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "license": "BSD-2-Clause" + }, + "node_modules/domhandler": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", + "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", + "license": "BSD-2-Clause", + "dependencies": { + "domelementtype": "^2.3.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/domutils": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz", + "integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==", + "license": "BSD-2-Clause", + "dependencies": { + "dom-serializer": "^2.0.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, + "node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", + "license": "MIT" + }, + "node_modules/encoding-sniffer": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.1.tgz", + "integrity": "sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw==", + "license": "MIT", + "dependencies": { + "iconv-lite": "^0.6.3", 
+ "whatwg-encoding": "^3.1.1" + }, + "funding": { + "url": "https://github.com/fb55/encoding-sniffer?sponsor=1" + } + }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/env-paths": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/env-paths/-/env-paths-2.2.1.tgz", @@ -1153,6 +1470,15 @@ "@esbuild/win32-x64": "0.25.10" } }, + "node_modules/escalade": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", + "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/estree-walker": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-3.0.3.tgz", @@ -1200,6 +1526,29 @@ "node": ">=12.0.0" } }, + "node_modules/fetch-blob": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/fetch-blob/-/fetch-blob-3.2.0.tgz", + "integrity": "sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "paypal", + "url": "https://paypal.me/jimmywarting" + } + ], + "license": "MIT", + "dependencies": { + "node-domexception": "^1.0.0", + "web-streams-polyfill": "^3.0.3" + }, + "engines": { + "node": "^12.20 || >= 14.13" + } + }, "node_modules/figures": { "version": "6.1.0", "resolved": "https://registry.npmjs.org/figures/-/figures-6.1.0.tgz", @@ -1216,6 +1565,18 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/formdata-polyfill": { + "version": 
"4.0.10", + "resolved": "https://registry.npmjs.org/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz", + "integrity": "sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==", + "license": "MIT", + "dependencies": { + "fetch-blob": "^3.1.2" + }, + "engines": { + "node": ">=12.20.0" + } + }, "node_modules/fsevents": { "version": "2.3.3", "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", @@ -1231,6 +1592,27 @@ "node": "^8.16.0 || ^10.6.0 || >=11.0.0" } }, + "node_modules/get-caller-file": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", + "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==", + "license": "ISC", + "engines": { + "node": "6.* || 8.* || >= 10.*" + } + }, + "node_modules/get-east-asian-width": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/get-east-asian-width/-/get-east-asian-width-1.4.0.tgz", + "integrity": "sha512-QZjmEOC+IT1uk6Rx0sX22V6uHWVwbdbxf1faPqJ1QhLdGgsRGCZoyaQBm/piRdJy/D2um6hM1UP7ZEeQ4EkP+Q==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/get-stream": { "version": "9.0.1", "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-9.0.1.tgz", @@ -1261,6 +1643,58 @@ "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1" } }, + "node_modules/glowfic-dl": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/glowfic-dl/-/glowfic-dl-0.2.1.tgz", + "integrity": "sha512-2lqktxHgXedrTOoM0pv7Sf14z0JETiayOAo/Mpd8Ow0Ke0Wx3WvoNjrOaTwxeVGC2PvDcsy47UVtpttRj6Sk5Q==", + "dependencies": { + "@types/turndown": "^5.0.6", + "cheerio": "^1.0.0-rc.12", + "node-fetch": "^3.3.2", + "ora": "^9.0.0", + "turndown": "^7.2.2", + "yargs": "^17.7.2", + "zod": "^3.23.8" + }, + "bin": { + "glowfic-dl": "dist/cli.js", + "glowfic-dl-ts": "dist/cli.js" 
+ }, + "engines": { + "node": ">=18" + } + }, + "node_modules/htmlparser2": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz", + "integrity": "sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.2.1", + "entities": "^6.0.0" + } + }, + "node_modules/htmlparser2/node_modules/entities": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz", + "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/human-signals": { "version": "8.0.1", "resolved": "https://registry.npmjs.org/human-signals/-/human-signals-8.0.1.tgz", @@ -1271,6 +1705,18 @@ "node": ">=18.18.0" } }, + "node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/import-fresh": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", @@ -1293,6 +1739,27 @@ "integrity": "sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==", "license": "MIT" }, + "node_modules/is-fullwidth-code-point": { + "version": "3.0.0", + "resolved": 
"https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/is-interactive": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-interactive/-/is-interactive-2.0.0.tgz", + "integrity": "sha512-qP1vozQRI+BMOPcjFzrjXuQvdak2pHNUMZoeG2eRbiSqyvbEf/wQtEOTOX1guk6E3t36RkaqiSt8A/6YElNxLQ==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/is-plain-obj": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-4.1.0.tgz", @@ -1323,7 +1790,6 @@ "version": "2.1.0", "resolved": "https://registry.npmjs.org/is-unicode-supported/-/is-unicode-supported-2.1.0.tgz", "integrity": "sha512-mE00Gnza5EEB3Ds0HfMyllZzbBrmLOX3vfWoj9A9PEnTfratQ/BcaJOuMhnkhjXvb2+FkY3VuHqtAGpTPmglFQ==", - "dev": true, "license": "MIT", "engines": { "node": ">=18" @@ -1369,6 +1835,22 @@ "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", "license": "MIT" }, + "node_modules/log-symbols": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/log-symbols/-/log-symbols-7.0.1.tgz", + "integrity": "sha512-ja1E3yCr9i/0hmBVaM0bfwDjnGy8I/s6PP4DFp+yP+a+mrHO4Rm7DtmnqROTUkHIkqffC84YY7AeqX6oFk0WFg==", + "license": "MIT", + "dependencies": { + "is-unicode-supported": "^2.0.0", + "yoctocolors": "^2.1.1" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/loupe": { "version": "3.2.1", "resolved": "https://registry.npmjs.org/loupe/-/loupe-3.2.1.tgz", @@ -1386,6 +1868,18 @@ "@jridgewell/sourcemap-codec": "^1.5.5" } }, + "node_modules/mimic-function": { + "version": "5.0.1", + "resolved": 
"https://registry.npmjs.org/mimic-function/-/mimic-function-5.0.1.tgz", + "integrity": "sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/ms": { "version": "2.1.3", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", @@ -1412,6 +1906,44 @@ "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" } }, + "node_modules/node-domexception": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", + "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", + "deprecated": "Use your platform's native DOMException instead", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "github", + "url": "https://paypal.me/jimmywarting" + } + ], + "license": "MIT", + "engines": { + "node": ">=10.5.0" + } + }, + "node_modules/node-fetch": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-3.3.2.tgz", + "integrity": "sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==", + "license": "MIT", + "dependencies": { + "data-uri-to-buffer": "^4.0.0", + "fetch-blob": "^3.1.4", + "formdata-polyfill": "^4.0.10" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/node-fetch" + } + }, "node_modules/npm-run-path": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-6.0.0.tgz", @@ -1442,6 +1974,56 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": 
"sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0" + }, + "funding": { + "url": "https://github.com/fb55/nth-check?sponsor=1" + } + }, + "node_modules/onetime": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/onetime/-/onetime-7.0.0.tgz", + "integrity": "sha512-VXJjc87FScF88uafS3JllDgvAm+c/Slfz06lorj2uAY34rlUu0Nt+v8wreiImcrgAjjIHp1rXpTDlLOGw29WwQ==", + "license": "MIT", + "dependencies": { + "mimic-function": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/ora": { + "version": "9.0.0", + "resolved": "https://registry.npmjs.org/ora/-/ora-9.0.0.tgz", + "integrity": "sha512-m0pg2zscbYgWbqRR6ABga5c3sZdEon7bSgjnlXC64kxtxLOyjRcbbUkLj7HFyy/FTD+P2xdBWu8snGhYI0jc4A==", + "license": "MIT", + "dependencies": { + "chalk": "^5.6.2", + "cli-cursor": "^5.0.0", + "cli-spinners": "^3.2.0", + "is-interactive": "^2.0.0", + "is-unicode-supported": "^2.1.0", + "log-symbols": "^7.0.1", + "stdin-discarder": "^0.2.2", + "string-width": "^8.1.0", + "strip-ansi": "^7.1.2" + }, + "engines": { + "node": ">=20" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/parent-module": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", @@ -1485,6 +2067,55 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/parse5": { + "version": "7.3.0", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz", + "integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==", + "license": "MIT", + "dependencies": { + "entities": "^6.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5-htmlparser2-tree-adapter": { + "version": "7.1.0", + "resolved": 
"https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz", + "integrity": "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==", + "license": "MIT", + "dependencies": { + "domhandler": "^5.0.3", + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5-parser-stream": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz", + "integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==", + "license": "MIT", + "dependencies": { + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5/node_modules/entities": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz", + "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/path-key": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", @@ -1563,6 +2194,15 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/require-directory": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", + "integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/resolve-from": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", @@ -1582,6 +2222,22 @@ "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" } }, + 
"node_modules/restore-cursor": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/restore-cursor/-/restore-cursor-5.1.0.tgz", + "integrity": "sha512-oMA2dcrw6u0YfxJQXm342bFKX/E4sG9rbTzO9ptUcR/e8A33cHuvStiYOwH7fszkZlZ1z/ta9AAoPk2F4qIOHA==", + "license": "MIT", + "dependencies": { + "onetime": "^7.0.0", + "signal-exit": "^4.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/rollup": { "version": "4.52.4", "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.52.4.tgz", @@ -1624,6 +2280,12 @@ "fsevents": "~2.3.2" } }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", + "license": "MIT" + }, "node_modules/shebang-command": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", @@ -1658,7 +2320,6 @@ "version": "4.1.0", "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==", - "dev": true, "license": "ISC", "engines": { "node": ">=14" @@ -1691,6 +2352,49 @@ "dev": true, "license": "MIT" }, + "node_modules/stdin-discarder": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/stdin-discarder/-/stdin-discarder-0.2.2.tgz", + "integrity": "sha512-UhDfHmA92YAlNnCfhmq0VeNL5bDbiZGg7sZ2IvPsXubGkiNa9EC+tUTsjBRsYUAz87btI6/1wf4XoVvQ3uRnmQ==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/string-width": { + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-8.1.0.tgz", + "integrity": 
"sha512-Kxl3KJGb/gxkaUMOjRsQ8IrXiGW75O4E3RPjFIINOVH8AMl2SQ/yWdTzWwF3FevIX9LcMAjJW+GRwAlAbTSXdg==", + "license": "MIT", + "dependencies": { + "get-east-asian-width": "^1.3.0", + "strip-ansi": "^7.1.0" + }, + "engines": { + "node": ">=20" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/strip-ansi": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.2.tgz", + "integrity": "sha512-gmBGslpoQJtgnMAvOVqGZpEz9dyoKTCzy2nfz/n8aIFhN/jCE/rCmcxabB6jOOHV+0WNnylOxaxBQPSvcWklhA==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^6.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/strip-ansi?sponsor=1" + } + }, "node_modules/strip-final-newline": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/strip-final-newline/-/strip-final-newline-4.0.0.tgz", @@ -1768,6 +2472,15 @@ "fsevents": "~2.3.3" } }, + "node_modules/turndown": { + "version": "7.2.2", + "resolved": "https://registry.npmjs.org/turndown/-/turndown-7.2.2.tgz", + "integrity": "sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ==", + "license": "MIT", + "dependencies": { + "@mixmark-io/domino": "^2.2.0" + } + }, "node_modules/typescript": { "version": "5.9.3", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", @@ -1782,6 +2495,15 @@ "node": ">=14.17" } }, + "node_modules/undici": { + "version": "7.16.0", + "resolved": "https://registry.npmjs.org/undici/-/undici-7.16.0.tgz", + "integrity": "sha512-QEg3HPMll0o3t2ourKwOeUAZ159Kn9mx5pnzHRQO8+Wixmh88YdZRiIwat0iNzNNXn0yoEtXJqFpyW7eM8BV7g==", + "license": "MIT", + "engines": { + "node": ">=20.18.1" + } + }, "node_modules/undici-types": { "version": "6.21.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", @@ -2381,6 +3103,36 @@ } } }, + "node_modules/web-streams-polyfill": { + "version": "3.3.3", + "resolved": 
"https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz", + "integrity": "sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==", + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, + "node_modules/whatwg-encoding": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", + "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", + "license": "MIT", + "dependencies": { + "iconv-lite": "0.6.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/whatwg-mimetype": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz", + "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==", + "license": "MIT", + "engines": { + "node": ">=18" + } + }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", @@ -2414,11 +3166,133 @@ "node": ">=8" } }, + "node_modules/wrap-ansi": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", + "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", + "license": "MIT", + "dependencies": { + "ansi-styles": "^4.0.0", + "string-width": "^4.1.0", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, + "node_modules/wrap-ansi/node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/wrap-ansi/node_modules/string-width": { + "version": "4.2.3", + 
"resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "license": "MIT", + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/wrap-ansi/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/y18n": { + "version": "5.0.8", + "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", + "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==", + "license": "ISC", + "engines": { + "node": ">=10" + } + }, + "node_modules/yargs": { + "version": "17.7.2", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz", + "integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==", + "license": "MIT", + "dependencies": { + "cliui": "^8.0.1", + "escalade": "^3.1.1", + "get-caller-file": "^2.0.5", + "require-directory": "^2.1.1", + "string-width": "^4.2.3", + "y18n": "^5.0.5", + "yargs-parser": "^21.1.1" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/yargs-parser": { + "version": "21.1.1", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz", + "integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/yargs/node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": 
"https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/yargs/node_modules/string-width": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "license": "MIT", + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/yargs/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/yoctocolors": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/yoctocolors/-/yoctocolors-2.1.2.tgz", "integrity": "sha512-CzhO+pFNo8ajLM2d2IW/R93ipy99LWjtwblvC1RsoSUMZgyLbYFr221TnSNT7GjGdYui6P459mw9JH/g/zW2ug==", - "dev": true, "license": "MIT", "engines": { "node": ">=18" @@ -2426,6 +3300,15 @@ "funding": { "url": "https://github.com/sponsors/sindresorhus" } + }, + "node_modules/zod": { + "version": "3.25.76", + "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz", + "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/colinhacks" + } } } } diff --git a/package.json b/package.json index 06fb080..9f2007b 100644 --- a/package.json +++ b/package.json @@ -54,7 +54,8 @@ "node": ">=18" }, "dependencies": { - "cosmiconfig": "^9.0.0" + "cosmiconfig": "^9.0.0", + 
"glowfic-dl": "^0.2.1" }, "devDependencies": { "@types/node": "^22.7.4", diff --git a/src/cli/splice.ts b/src/cli/splice.ts index 0c5bb4c..cf86211 100644 --- a/src/cli/splice.ts +++ b/src/cli/splice.ts @@ -13,6 +13,7 @@ import { fileURLToPath } from "node:url"; import { CLIOptions, parseArgs, makeLogger, usage } from "../core/types"; import { detectTwitterArchive, ingestTwitter } from "../sources/twitter"; +// Glowfic support is loaded dynamically when needed to avoid ESM/undici issues on Node 18 import { applyFilters, indexById, @@ -118,6 +119,15 @@ async function main() { "--status", "--ids", "--ids-file", + "--glowfic", + "--glowfic-url", + "--glowfic-urls", + "--glowfic-board", + "--all-characters", + "--min-posts", + "--assistant", + "--assistant-regex", + "--assistant-re", "--", ]); const unknown = argv.filter( @@ -153,17 +163,149 @@ async function main() { } } - if (!opts.source || !opts.out) { + if ( + (!opts.source && !(opts.glowfic && opts.glowfic.length > 0) && !opts.glowficBoard) || + !opts.out + ) { process.stderr.write(usage() + "\n"); process.exit(2); } - const source = path.resolve(opts.source); + const source = opts.source ? 
path.resolve(opts.source) : ""; const outDir = path.resolve(opts.out); const workspaceDir = path.resolve( opts.workspace || path.join(outDir, ".splice"), ); + // Glowfic multi-character board export (when --glowfic-board provided) + if (opts.glowficBoard) { + try { + logger("info", `Fetching Glowfic board: ${opts.glowficBoard}`); + + // Lazy-load Glowfic support + const { + fetchGlowficThreads, + segmentBoardByAllCharacters, + extractUniqueCharacters, + } = await import("../sources/glowfic"); + const { writeHuggingFaceDataset } = await import("../outputs/hf-dataset"); + + // Fetch all threads from the board (single request) + const threads = await fetchGlowficThreads(opts.glowficBoard, logger, { + markdown: true, + }); + logger("info", `Fetched ${threads.length} thread(s)`); + + // Extract and log character stats + const allChars = extractUniqueCharacters(threads); + logger("info", `Found ${allChars.length} unique character(s)`); + logger("info", `Characters with ≥${opts.minPosts} posts: ${allChars.filter(c => c.postCount >= opts.minPosts).length}`); + + // Segment by all characters + const results = segmentBoardByAllCharacters(threads, { + minPosts: opts.minPosts, + markdown: true, + }); + logger("info", `Generated datasets for ${results.length} character(s)`); + + // Extract source name from URL + const boardUrl = new URL(opts.glowficBoard); + const boardId = boardUrl.pathname.split("/").pop() || "board"; + const sourceName = `Glowfic Board ${boardId}`; + + // Write HuggingFace dataset + const { characterCount, conversationCount } = await writeHuggingFaceDataset( + results, + { + outDir, + sourceName, + sourceUrl: opts.glowficBoard, + dryRun: opts.dryRun, + logger, + }, + ); + + logger("info", `Exported ${conversationCount} conversations across ${characterCount} characters`); + logger("info", opts.dryRun ? "Dry run complete." 
: "Done."); + process.exit(0); + } catch (e) { + logger("error", (e as Error).message); + process.exit(1); + } + } + + // Glowfic pipeline (when --glowfic provided) + if (opts.glowfic && opts.glowfic.length > 0) { + // Build assistant matcher: prefer explicit regex, else substring match for --assistant + let re: RegExp | null = null; + if (opts.assistantRegex && opts.assistantRegex.length) { + try { + re = new RegExp(opts.assistantRegex, "i"); + } catch (e) { + logger("error", `Invalid --assistant-regex: ${(e as Error).message}`); + process.exit(2); + } + } else if (opts.assistant && opts.assistant.length) { + try { + re = new RegExp(opts.assistant, "i"); + } catch (e) { + logger("error", `Invalid --assistant: ${(e as Error).message}`); + process.exit(2); + } + } + if (!re) { + logger( + "error", + "When using --glowfic, you must provide --assistant or --assistant-regex", + ); + process.exit(2); + } + try { + // Lazy-load Glowfic support to avoid undici import on Node 18 when not used + const { conversationsFromGlowficUrls } = await import( + "../sources/glowfic" + ); + const convs = await conversationsFromGlowficUrls( + opts.glowfic, + { displayName: re, handle: re, author: re } as any, + logger, + { + markdown: true, + mergeConsecutive: true, + trimToLastAssistant: true, + }, + ); + const outPath = path.join(outDir, "conversations_oai.jsonl"); + if (opts.dryRun) { + logger( + "info", + `(dry-run) would write ${convs.length} segmented conversation(s) to ${outPath}`, + ); + } else { + await fs.mkdir(path.dirname(outPath), { recursive: true }); + const fh = await fs.open(outPath, "w"); + const systemMessage = + process.env.SPLICE_SYSTEM_MESSAGE ?? 
opts.systemMessage; + for (const { messages } of convs) { + if (!messages.length) continue; + const record = { + messages: [{ role: "system", content: systemMessage }, ...messages], + }; + await fh.write(JSON.stringify(record) + "\n"); + } + await fh.close(); + logger( + "info", + `Wrote ${convs.length} segmented conversation(s) to ${outPath}`, + ); + } + logger("info", opts.dryRun ? "Dry run complete." : "Done."); + process.exit(0); + } catch (e) { + logger("error", (e as Error).message); + process.exit(1); + } + } const detected = await detectTwitterArchive(source); if (!detected) { logger( diff --git a/src/core/types.ts b/src/core/types.ts index 90aad3f..acc4f9c 100644 --- a/src/core/types.ts +++ b/src/core/types.ts @@ -10,7 +10,11 @@ import * as path from "node:path"; export type Level = "debug" | "info" | "warn" | "error"; -export type SourceId = "twitter:tweet" | "twitter:like" | string; +export type SourceId = + | "twitter:tweet" + | "twitter:like" + | "glowfic:post" + | string; export interface MediaAttachment { id: string; @@ -84,6 +88,14 @@ export type CLIOptions = { idsFile?: string; // outputs statsJson: boolean; + // glowfic + glowfic?: string[]; // one or more Glowfic URLs (thread/section/board) + assistant?: string; // case-insensitive match on character display name/handle/author + assistantRegex?: string; // regex (JS) on display name/handle/author + // glowfic multi-character export + glowficBoard?: string; // single board URL for multi-character export + allCharacters: boolean; // export for all characters + minPosts: number; // minimum posts for character inclusion }; export const DEFAULT_SYSTEM_MESSAGE = "You have been uploaded to the internet"; @@ -111,6 +123,14 @@ export function parseArgs(argv: string[]): CLIOptions { statsJson: false, workspace: undefined, checkpoint: undefined, + // glowfic + glowfic: [], + assistant: undefined, + assistantRegex: undefined, + // glowfic multi-character export + glowficBoard: undefined, + allCharacters: 
false, + minPosts: 10, }; const args = argv.slice(2); @@ -174,6 +194,27 @@ export function parseArgs(argv: string[]): CLIOptions { opts.onlyThreads = true; } else if (a === "--with-media") { opts.withMedia = true; + } else if ( + a === "--glowfic" || + a === "--glowfic-url" || + a === "--glowfic-urls" + ) { + const next = args[++i]; + if (next) { + const parts = next.split(",").filter(Boolean); + if (parts.length > 1) opts.glowfic = parts; + else { + const list = [next]; + while (args[i + 1] && !args[i + 1].startsWith("-")) { + list.push(args[++i]); + } + opts.glowfic = list; + } + } + } else if (a === "--assistant") { + opts.assistant = args[++i]; + } else if (a === "--assistant-regex" || a === "--assistant-re") { + opts.assistantRegex = args[++i]; } else if (a === "--stats-json") { opts.statsJson = true; } else if (a === "--decisions-import" || a === "--decisions-file") { @@ -195,6 +236,13 @@ export function parseArgs(argv: string[]): CLIOptions { } } else if (a === "--ids-file") { opts.idsFile = args[++i]; + } else if (a === "--glowfic-board") { + opts.glowficBoard = args[++i]; + } else if (a === "--all-characters") { + opts.allCharacters = true; + } else if (a === "--min-posts") { + const v = parseInt(args[++i] ?? 
"", 10); + if (!Number.isNaN(v)) opts.minPosts = v; } else if (a === "--") { break; } else if (a.startsWith("-")) { @@ -211,7 +259,7 @@ export function parseArgs(argv: string[]): CLIOptions { export function usage(): string { return [ - "splice — convert a Twitter archive to Markdown, OAI JSONL, and/or JSON", + "splice — convert a Twitter archive or Glowfic URLs to Markdown, OAI JSONL, and/or JSON", "", "Usage:", " splice --source --out [--format markdown oai json sharegpt] [--system-message ]", @@ -237,6 +285,12 @@ export function usage(): string { " --verbose Debug logging", " --version, -V Show version", " --help, -h Show help", + " --glowfic One or more Glowfic URLs (thread, section, or board)", + " --assistant Assistant selector (case-insensitive match on character display name, handle, or author)", + " --assistant-regex Assistant selector regex (JavaScript), tested on display name, handle, or author", + " --glowfic-board Single board URL for multi-character export", + " --all-characters Export datasets for all characters (with --glowfic-board)", + " --min-posts Minimum posts for character inclusion (default: 10)", "", "Examples:", " splice --source ./archive --out ./out --format markdown oai json", @@ -244,6 +298,9 @@ export function usage(): string { " splice --source ./archive --out ./out --since 2024-01-01 --only-threads", " splice --source ./archive --out ./out --json-stdout", " splice --version", + " splice --glowfic https://glowfic.com/posts/5506 --out ./out --format oai --assistant carissa", + ' splice --glowfic https://glowfic.com/boards/215 --out ./out --format oai --assistant-regex "carissa"', + " splice --glowfic-board https://glowfic.com/boards/215 --out ./out --all-characters --min-posts 20", "", "Docs: https://github.com/deepfates/splice • Context: https://deepfates.com/convert-your-twitter-archive-into-training-data", ].join("\n"); diff --git a/src/externals/glowfic-dl.d.ts b/src/externals/glowfic-dl.d.ts new file mode 100644 index 
0000000..c44af5a --- /dev/null +++ b/src/externals/glowfic-dl.d.ts @@ -0,0 +1,92 @@ +declare module "glowfic-dl" { + // Types mirrored from the glowfic-dl README/source (minimal surface) + export type Post = { + post_id: string; + author: string | null; + character_display_name: string | null; + character_handle: string | null; + icon_url: string | null; + timestamp: string | null; + content: string; + }; + + export type Thread = { + id: string; + title: string; + url: string; + description: string | null; + posts: Post[]; + authors: string[]; + created_at?: string | null; + updated_at?: string | null; + }; + + export type Section = { + id: string; + title: string | null; + description: string | null; + threads: Thread[]; + }; + + export type Board = { + id: string; + title: string; + description: string | null; + sections: Section[]; + threads: Thread[]; + }; + + export type BookStructure = + | { kind: "thread"; thread: Thread } + | { kind: "section"; section: Section } + | { kind: "board"; board: Board }; + + export const GLOWFIC_ROOT: string; + + // Fetchers + export function fetchThread(url: string): Promise; + export function fetchSection(url: string): Promise
; + export function fetchBoard(url: string): Promise; + export function fetchStructure(url: string): Promise; + + // HTML/Markdown transforms + export function htmlToMarkdown(html: string, options?: { + baseUrl?: string; + absoluteUrls?: boolean; + headingStyle?: "setext" | "atx"; + bulletListMarker?: "-" | "*" | "+"; + keepUnknownInlineHtml?: boolean; + }): string; + + export function postToMarkdown(p: Post, options?: { + baseUrl?: string; + absoluteUrls?: boolean; + headingStyle?: "setext" | "atx"; + bulletListMarker?: "-" | "*" | "+"; + keepUnknownInlineHtml?: boolean; + }): Post; + + export function threadToMarkdown(t: Thread, options?: { + baseUrl?: string; + absoluteUrls?: boolean; + headingStyle?: "setext" | "atx"; + bulletListMarker?: "-" | "*" | "+"; + keepUnknownInlineHtml?: boolean; + }): Thread; + + export function sectionToMarkdown(s: Section, options?: { + baseUrl?: string; + absoluteUrls?: boolean; + headingStyle?: "setext" | "atx"; + bulletListMarker?: "-" | "*" | "+"; + keepUnknownInlineHtml?: boolean; + }): Section; + + export function boardToMarkdown(b: Board, options?: { + baseUrl?: string; + absoluteUrls?: boolean; + headingStyle?: "setext" | "atx"; + bulletListMarker?: "-" | "*" | "+"; + keepUnknownInlineHtml?: boolean; + }): Board; +} diff --git a/src/index.ts b/src/index.ts index 132eed5..17de835 100644 --- a/src/index.ts +++ b/src/index.ts @@ -16,8 +16,9 @@ // Re-export shared types, args, logger, and utilities export * from "./core/types"; -// Re-export built-in Source(s) +// Re-export built-in Sources export * from "./sources/twitter"; +export * from "./sources/glowfic"; // Re-export built-in Transforms export * from "./transforms/core"; diff --git a/src/outputs/hf-dataset.ts b/src/outputs/hf-dataset.ts new file mode 100644 index 0000000..16ca05e --- /dev/null +++ b/src/outputs/hf-dataset.ts @@ -0,0 +1,342 @@ +import * as fs from "node:fs/promises"; +import * as path from "node:path"; +import type { Level, ChatMessage } from 
"../core/types"; +import type { GlowficCharacter, MultiCharacterResult } from "../sources/glowfic"; + +/** + * Ensure a directory exists (mkdir -p). + */ +async function ensureDir(p: string) { + await fs.mkdir(p, { recursive: true }); +} + +/** + * Sanitize a character ID for use as a directory name. + */ +function sanitizeCharacterId(id: string): string { + return id + .toLowerCase() + .replace(/[^a-z0-9_-]/g, "_") + .replace(/_+/g, "_") + .replace(/^_|_$/g, "") + .slice(0, 64) || "unknown"; +} + +/** + * Options for HuggingFace dataset writing. + */ +export interface HuggingFaceDatasetOptions { + /** Output directory */ + outDir: string; + /** Board/source name for metadata */ + sourceName: string; + /** Source URL */ + sourceUrl: string; + /** Dry run mode */ + dryRun: boolean; + /** Logger function */ + logger: (l: Level, m: string) => void; +} + +/** + * Metadata for a single character's dataset. + */ +export interface CharacterDatasetMeta { + character_id: string; + character_handle: string | null; + character_display_name: string | null; + author: string | null; + post_count: number; + conversation_count: number; + message_count: number; + source: string; + source_url: string; + created_at: string; +} + +/** + * Generate a simple system prompt for a character. + * Format: "You are {displayName} ({handle})." or just "You are {displayName}." + */ +function generateCharacterSystemPrompt(character: GlowficCharacter): string { + const name = character.displayName || character.id; + // Include handle as epithet if it differs from display name + if (character.handle && character.handle !== character.displayName) { + return `You are ${name} (${character.handle}).`; + } + return `You are ${name}.`; +} + +/** + * Write a single character's dataset as OpenAI JSONL. 
+ */ +async function writeCharacterDataset( + result: MultiCharacterResult, + charDir: string, + sourceName: string, + sourceUrl: string, + logger: (l: Level, m: string) => void, + dryRun: boolean, +): Promise { + // Generate character-specific system prompt + const systemMessage = generateCharacterSystemPrompt(result.character); + const trainPath = path.join(charDir, "train.jsonl"); + const metaPath = path.join(charDir, "metadata.json"); + + if (dryRun) { + logger("info", `(dry-run) would write ${result.conversations.length} conversations to ${trainPath}`); + } else { + await ensureDir(charDir); + + // Write JSONL + const fh = await fs.open(trainPath, "w"); + for (const msgs of result.conversations) { + const record = { + messages: [ + { role: "system", content: systemMessage }, + ...msgs, + ], + }; + await fh.write(JSON.stringify(record) + "\n"); + } + await fh.close(); + } + + const meta: CharacterDatasetMeta = { + character_id: result.character.id, + character_handle: result.character.handle, + character_display_name: result.character.displayName, + author: result.character.author, + post_count: result.character.postCount, + conversation_count: result.conversations.length, + message_count: result.messageCount, + source: sourceName, + source_url: sourceUrl, + created_at: new Date().toISOString(), + }; + + if (!dryRun) { + await fs.writeFile(metaPath, JSON.stringify(meta, null, 2), "utf8"); + } + + return meta; +} + +/** + * Generate a HuggingFace dataset card (README.md). 
+ */ +function generateDatasetCard( + sourceName: string, + sourceUrl: string, + characterMetas: CharacterDatasetMeta[], +): string { + const totalConversations = characterMetas.reduce((a, c) => a + c.conversation_count, 0); + const totalMessages = characterMetas.reduce((a, c) => a + c.message_count, 0); + + const characterTable = characterMetas + .slice(0, 20) // Top 20 + .map(c => `| ${c.character_display_name || c.character_id} | ${c.conversation_count} | ${c.message_count} |`) + .join("\n"); + + return `--- +license: cc-by-4.0 +task_categories: + - conversational + - text-generation +language: + - en +tags: + - roleplay + - fiction + - glowfic + - chat +size_categories: + - 1K/ +│ ├── train.jsonl # Conversation data +│ └── metadata.json # Character metadata +└── ... +\`\`\` + +## Usage + +\`\`\`python +from datasets import load_dataset + +# Load a specific character +ds = load_dataset("json", data_files="characters/keltham/train.jsonl") + +# Or load all characters +ds = load_dataset("json", data_files="characters/*/train.jsonl") +\`\`\` + +## License + +This dataset is provided under CC-BY-4.0. Original content from ${sourceUrl}. + +## Source + +Extracted on ${new Date().toISOString().split("T")[0]} using splice multi-character export. +`; +} + +/** + * Generate dataset_info.json for HuggingFace. 
+ */ +function generateDatasetInfo( + sourceName: string, + sourceUrl: string, + characterMetas: CharacterDatasetMeta[], +): object { + return { + description: `Character conversations extracted from ${sourceName}`, + citation: "", + homepage: sourceUrl, + license: "cc-by-4.0", + features: { + messages: { + feature: { + role: { dtype: "string", _type: "Value" }, + content: { dtype: "string", _type: "Value" }, + }, + _type: "Sequence", + }, + }, + splits: { + train: { + name: "train", + num_examples: characterMetas.reduce((a, c) => a + c.conversation_count, 0), + }, + }, + download_size: 0, + dataset_size: 0, + }; +} + +/** + * Write a complete HuggingFace-compatible dataset structure. + */ +export async function writeHuggingFaceDataset( + results: MultiCharacterResult[], + opts: HuggingFaceDatasetOptions, +): Promise<{ characterCount: number; conversationCount: number }> { + const { outDir, sourceName, sourceUrl, dryRun, logger } = opts; + const charactersDir = path.join(outDir, "characters"); + + logger("info", `Writing HuggingFace dataset for ${results.length} characters`); + + if (!dryRun) { + await ensureDir(charactersDir); + } + + const characterMetas: CharacterDatasetMeta[] = []; + let totalConversations = 0; + + for (const result of results) { + const charDirName = sanitizeCharacterId(result.character.id); + const charDir = path.join(charactersDir, charDirName); + + const meta = await writeCharacterDataset( + result, + charDir, + sourceName, + sourceUrl, + logger, + dryRun, + ); + characterMetas.push(meta); + totalConversations += result.conversations.length; + + logger("info", ` ${result.character.displayName || result.character.id}: ${result.conversations.length} conversations`); + } + + // Write combined train.jsonl (all characters) + const combinedPath = path.join(outDir, "train.jsonl"); + if (dryRun) { + logger("info", `(dry-run) would write combined dataset to ${combinedPath}`); + } else { + const fh = await fs.open(combinedPath, "w"); + for (const 
result of results) { + const charSystemMessage = generateCharacterSystemPrompt(result.character); + for (const msgs of result.conversations) { + const record = { + messages: [ + { role: "system", content: charSystemMessage }, + ...msgs, + ], + }; + await fh.write(JSON.stringify(record) + "\n"); + } + } + await fh.close(); + logger("info", `Wrote combined dataset to ${combinedPath}`); + } + + // Write README.md (dataset card) + const readmePath = path.join(outDir, "README.md"); + if (!dryRun) { + const readme = generateDatasetCard(sourceName, sourceUrl, characterMetas); + await fs.writeFile(readmePath, readme, "utf8"); + logger("info", `Wrote dataset card to ${readmePath}`); + } + + // Write dataset_info.json + const infoPath = path.join(outDir, "dataset_info.json"); + if (!dryRun) { + const info = generateDatasetInfo(sourceName, sourceUrl, characterMetas); + await fs.writeFile(infoPath, JSON.stringify(info, null, 2), "utf8"); + logger("info", `Wrote dataset info to ${infoPath}`); + } + + // Write characters manifest + const manifestPath = path.join(outDir, "characters.json"); + if (!dryRun) { + await fs.writeFile(manifestPath, JSON.stringify(characterMetas, null, 2), "utf8"); + logger("info", `Wrote characters manifest to ${manifestPath}`); + } + + return { + characterCount: results.length, + conversationCount: totalConversations, + }; +} diff --git a/src/outputs/writers.ts b/src/outputs/writers.ts index 1b41376..f17b517 100644 --- a/src/outputs/writers.ts +++ b/src/outputs/writers.ts @@ -1,5 +1,6 @@ import * as fs from "node:fs/promises"; import * as path from "node:path"; +import * as os from "node:os"; import { ContentItem, Thread, @@ -17,33 +18,66 @@ async function ensureDir(p: string) { await fs.mkdir(p, { recursive: true }); } +const DEFAULT_COPY_CONCURRENCY = Math.max( + 2, + Math.min( + 32, + typeof os.availableParallelism === "function" + ? 
os.availableParallelism() + : 8 + ) +); + /** * Copy media attachments for a set of items into imagesDir, prefixing names with "_". * If an attachment lacks absPath, it will be skipped with a warning. + * Copies are performed with bounded concurrency to speed up large archives. */ async function copyMedia( items: ContentItem[], imagesDir: string, - logger: (l: Level, m: string) => void, + logger: (l: Level, m: string) => void ) { await ensureDir(imagesDir); + + const copies: Array<{ src: string; dest: string }> = []; for (const it of items) { for (const m of it.media ?? []) { - const base = m.absPath ? path.basename(m.absPath) : `${m.id}.bin`; + if (!m.absPath) { + logger("warn", `No absPath for media ${m.id}; skipping copy`); + continue; + } + const base = path.basename(m.absPath); + copies.push({ src: m.absPath, dest: path.join(imagesDir, `_${base}`) }); + } + } + + if (!copies.length) return; + + const parsedEnv = Number.parseInt( + process.env.SPLICE_MEDIA_CONCURRENCY ?? "", + 10 + ); + const concurrency = + Number.isFinite(parsedEnv) && parsedEnv > 0 + ? parsedEnv + : DEFAULT_COPY_CONCURRENCY; + + let next = 0; + const worker = async () => { + while (true) { + const idx = next++; + if (idx >= copies.length) break; + const { src, dest } = copies[idx]; try { - if (!m.absPath) { - logger("warn", `No absPath for media ${m.id}; skipping copy`); - continue; - } - await fs.copyFile(m.absPath, path.join(imagesDir, `_${base}`)); + await fs.copyFile(src, dest); } catch (e) { - logger( - "warn", - `Failed to copy media ${m.absPath ?? 
m.id}: ${(e as Error).message}`, - ); + logger("warn", `Failed to copy media ${src}: ${(e as Error).message}`); } } - } + }; + + await Promise.all(Array.from({ length: concurrency }, worker)); } /** @@ -83,7 +117,7 @@ function isolateQuotedTweetLinks(text: string): string { /** * Write Markdown outputs: - * - threads/<yyyymmdd>-thread-<slug>.md with frontmatter, cleaned text, media links, and link to Twitter + * - threads/<yyyymmdd>/<slug>.md with frontmatter, cleaned text, media links, and link to Twitter * - tweets/<yyyymmdd>-tweet-<slug>.md for non-thread tweets (excluding RTs) * - images/_<file> copied for referenced items */ @@ -142,7 +176,9 @@ export async function writeMarkdown( const ymd = date.replace(/-/g, ""); const filePath = path.join(threadsDir, `${ymd}/${name}.md`); const topLink = `https://twitter.com/i/web/status/${first.id}`; - const body = `${fm}\n${parts.join("\n\n")}\n\n[View on Twitter](${topLink})`; + const body = `${fm}\n${parts.join( + "\n\n" + )}\n\n[View on Twitter](${topLink})`; if (dryRun) { logger("info", `(dry-run) would write thread file: ${filePath}`); diff --git a/src/sources/glowfic.ts b/src/sources/glowfic.ts new file mode 100644 index 0000000..e025300 --- /dev/null +++ b/src/sources/glowfic.ts @@ -0,0 +1,596 @@ +/** + * Glowfic source adapter and helpers + * + * - Uses the glowfic-dl library to fetch Threads/Sections/Boards from glowfic.com + * - Normalizes posts to ContentItem[] + * - Provides helpers to build ChatMessage conversations by selecting one character + * (by display name, handle, or author) as the "assistant" and others as "user" + * + * Notes: + * - Requires the "glowfic-dl" package to be installed in your project. 
+ * npm i glowfic-dl + */ + +import type { + Level, + ContentItem, + Thread as NormalizedThread, + ChatMessage, + Role, +} from "../core/types"; +import { toIso } from "../core/types"; + +import { + fetchStructure, + threadToMarkdown as glowThreadToMarkdown, + htmlToMarkdown as glowHtmlToMarkdown, + type Thread as GlowThread, + type Section as GlowSection, + type Board as GlowBoard, + type Post as GlowPost, + type BookStructure, +} from "glowfic-dl"; + +/* --------------------------------- Detect --------------------------------- */ + +/** + * Quick URL detection for Glowfic resources. + * Matches posts, board sections, and boards. + */ +export function detectGlowficUri(pathOrUri: string): boolean { + try { + const u = new URL(pathOrUri); + if (!/(\.|^)glowfic\.com$/i.test(u.hostname)) return false; + return /\/(posts|board_sections|boards)\//.test(u.pathname); + } catch { + // Not a URL; allow bare paths that look like glowfic routes + return /glowfic\.com\/(posts|board_sections|boards)\//.test(pathOrUri); + } +} + +/* ------------------------------ Fetch helpers ----------------------------- */ + +function threadsFromStructure(struct: BookStructure): GlowThread[] { + if (struct.kind === "thread") return [struct.thread]; + if (struct.kind === "section") return struct.section.threads; + if (struct.kind === "board") return struct.board.threads; + return []; +} + +/** + * Fetch all threads reachable from a Glowfic URL (thread/section/board). + * Optionally convert HTML content to Markdown for easier downstream use. 
+ */ +export async function fetchGlowficThreads( + url: string, + logger: (l: Level, m: string) => void = () => {}, + options?: { markdown?: boolean }, +): Promise { + logger("info", `Fetching Glowfic: ${url}`); + const struct = await fetchStructure(url); + let threads = threadsFromStructure(struct); + if (options?.markdown !== false) { + threads = threads.map((t) => glowThreadToMarkdown(t)); + } + logger("info", `Fetched ${threads.length} thread(s) from ${url}`); + return threads; +} + +/** + * Fetch threads from multiple URLs, flattening into a single list. + */ +export async function fetchGlowficThreadsMany( + urls: string[], + logger: (l: Level, m: string) => void = () => {}, + options?: { markdown?: boolean; concurrency?: number }, +): Promise { + const conc = Math.max(1, Math.min(options?.concurrency ?? 4, 16)); + const out: GlowThread[] = []; + const pending: Promise[] = []; + let i = 0; + + async function worker() { + while (i < urls.length) { + const idx = i++; + const u = urls[idx]; + try { + const ts = await fetchGlowficThreads(u, logger, { + markdown: options?.markdown, + }); + out.push(...ts); + } catch (err) { + logger("warn", `Failed to fetch ${u}: ${(err as Error).message}`); + } + } + } + + for (let k = 0; k < conc; k++) pending.push(worker()); + await Promise.all(pending); + return out; +} + +/* -------------------------- Normalization to Items ------------------------- */ + +/** + * Normalize a Glowfic post to a generic ContentItem. + * - id: derived from post_id if present, otherwise from index. + * - text: Markdown (recommended) or raw HTML stripped to text via glowfic-dl. + * - createdAt: ISO if available, else now. + * - source: "glowfic:post" + * - raw: original Glowfic post shape + */ +export function normalizeGlowficPost( + thread: GlowThread, + post: GlowPost, + indexInThread: number, + options?: { markdown?: boolean }, +): ContentItem { + const id = + post.post_id && post.post_id.length + ? 
`${thread.id}:${post.post_id}` + : `${thread.id}:idx-${indexInThread}`; + + let text = post.content ?? ""; + if (options?.markdown !== false) { + // glowfic-dl post.content is the inner HTML of the post; convert to Markdown for training + text = glowHtmlToMarkdown(post.content ?? ""); + } + + const createdAt = post.timestamp + ? toIso(post.timestamp) + : new Date().toISOString(); + return { + id, + text, + createdAt, + parentId: null, + inReplyToUserId: null, + accountId: null, + source: "glowfic:post", + raw: { + thread_id: thread.id, + thread_title: thread.title, + url: thread.url, + post, + }, + }; +} + +/** + * Convert a Glowfic thread into a normalized Thread (ContentItem[]) object. + */ +export function normalizeGlowficThread( + thread: GlowThread, + options?: { markdown?: boolean }, +): NormalizedThread { + const items = thread.posts.map((p, i) => + normalizeGlowficPost(thread, p, i, options), + ); + return { + id: thread.id, + items, + }; +} + +/** + * Normalize many Glowfic threads into ContentItems. + */ +export function normalizeGlowficThreadsToItems( + threads: GlowThread[], + options?: { markdown?: boolean }, +): ContentItem[] { + const out: ContentItem[] = []; + for (const t of threads) { + for (let i = 0; i < t.posts.length; i++) { + out.push(normalizeGlowficPost(t, t.posts[i]!, i, options)); + } + } + return out; +} + +/* -------------------------- Conversation generation ------------------------ */ + +export function segmentedConversationsFromGlowficThread( + thread: GlowThread, + assistant: AssistantMatcher, + options?: ConversationOptions, +): ChatMessage[][] { + const markdown = options?.markdown !== false; + + // Map posts to role + content + const msgs = (thread.posts || []) + .map((p) => { + const role: Role = isAssistantPost(p, assistant) ? "assistant" : "user"; + const content = markdown + ? glowHtmlToMarkdown(p.content ?? "") + : (p.content ?? 
""); + const c = (content || "").trim(); + if (!c) return null; + return { role, content: c } as ChatMessage; + }) + .filter(Boolean) as ChatMessage[]; + + const conversations: ChatMessage[][] = []; + let userBuf: string[] = []; + let asstBuf: string[] = []; + let seenAnyUser = false; + + const flushIfComplete = () => { + if (userBuf.length > 0 && asstBuf.length > 0) { + const userMsg: ChatMessage = { + role: "user", + content: userBuf.join("\n\n").trim(), + }; + const asstMsg: ChatMessage = { + role: "assistant", + content: asstBuf.join("\n\n").trim(), + }; + conversations.push([userMsg, asstMsg]); + userBuf = []; + asstBuf = []; + seenAnyUser = false; + } + }; + + for (const m of msgs) { + if (m.role === "user") { + // If we already accumulated an assistant block, that segment is complete; flush and start new. + if (asstBuf.length > 0) { + flushIfComplete(); + } + userBuf.push(m.content); + seenAnyUser = true; + } else { + // assistant + if (!seenAnyUser) { + // Leading assistant before any user: ignore (do not start a segment until users appear) + continue; + } + asstBuf.push(m.content); + } + } + + // Finalize trailing segment if it ends with assistant + flushIfComplete(); + + return conversations; +} + +/** + * How to decide which posts are the "assistant". + * - You can pass a string (matched against display name or handle, case-insensitive). + * - Or a RegExp on display name/handle/author. + * - Or a predicate function that receives the raw GlowPost. + */ +export type AssistantMatcher = + | string + | { + displayName?: string | RegExp; + handle?: string | RegExp; + author?: string | RegExp; + } + | ((post: GlowPost) => boolean); + +/** + * Returns true if a glowfic Post should be considered assistant according to the matcher. 
+ */ +export function isAssistantPost( + post: GlowPost, + matcher: AssistantMatcher, +): boolean { + const display = (post.character_display_name || "").trim(); + const handle = (post.character_handle || "").trim(); + const author = (post.author || "").trim(); + + // Predicate + if (typeof matcher === "function") return !!matcher(post); + + // String: match display name or handle case-insensitive + if (typeof matcher === "string") { + const needle = matcher.trim().toLowerCase(); + return ( + (display && display.toLowerCase() === needle) || + (handle && handle.toLowerCase() === needle) + ); + } + + // Object form + const matchStr = (val: string | null, target?: string | RegExp): boolean => { + if (!target) return false; + if (!val) return false; + if (typeof target === "string") + return val.toLowerCase() === target.toLowerCase(); + try { + return target.test(val); + } catch { + return false; + } + }; + + return ( + matchStr(display, matcher.displayName) || + matchStr(handle, matcher.handle) || + matchStr(author, matcher.author) + ); +} + +export type ConversationOptions = { + markdown?: boolean; // default true + mergeConsecutive?: boolean; // merge adjacent messages from same role + trimToLastAssistant?: boolean; // drop trailing user tail if assistant appears earlier +}; + +/** + * Convert a single Glowfic thread into messages with the chosen assistant character. + */ +export function conversationFromGlowficThread( + thread: GlowThread, + assistant: AssistantMatcher, + options?: ConversationOptions, +): ChatMessage[] { + const markdown = options?.markdown !== false; + const mergeConsecutive = options?.mergeConsecutive !== false; // default true + const trimToLastAssistant = options?.trimToLastAssistant !== false; // default true + + type P = GlowPost; + const posts: P[] = thread.posts || []; + + const messagesRaw: ChatMessage[] = posts + .map((p) => { + const role: Role = isAssistantPost(p, assistant) ? "assistant" : "user"; + const content = markdown + ? 
glowHtmlToMarkdown(p.content ?? "") + : (p.content ?? ""); + const c = (content || "").trim(); + if (!c) return null; + return { role, content: c }; + }) + .filter(Boolean) as ChatMessage[]; + + const msgs = mergeConsecutive ? mergeSameRole(messagesRaw) : messagesRaw; + + if (trimToLastAssistant) { + for (let i = msgs.length - 1; i >= 0; i--) { + if (msgs[i]!.role === "assistant") return msgs.slice(0, i + 1); + } + return []; // if no assistant lines, skip + } + + return msgs; +} + +/** + * Build conversations for each thread captured by the given URL (thread/section/board). + */ +export async function conversationsFromGlowficUrl( + url: string, + assistant: AssistantMatcher, + logger: (l: Level, m: string) => void = () => {}, + options?: ConversationOptions, +): Promise<{ thread: GlowThread; messages: ChatMessage[] }[]> { + const threads = await fetchGlowficThreads(url, logger, { + markdown: options?.markdown, + }); + const out: { thread: GlowThread; messages: ChatMessage[] }[] = []; + for (const t of threads) { + const segments = segmentedConversationsFromGlowficThread( + t, + assistant, + options, + ); + for (const messages of segments) { + if (messages.length > 0) out.push({ thread: t, messages }); + } + } + return out; +} + +/** + * Build conversations across many URLs (flattened). 
+ */ +export async function conversationsFromGlowficUrls( + urls: string[], + assistant: AssistantMatcher, + logger: (l: Level, m: string) => void = () => {}, + options?: ConversationOptions, +): Promise<{ thread: GlowThread; messages: ChatMessage[] }[]> { + const threads = await fetchGlowficThreadsMany(urls, logger, { + markdown: options?.markdown, + }); + const out: { thread: GlowThread; messages: ChatMessage[] }[] = []; + for (const t of threads) { + const segments = segmentedConversationsFromGlowficThread( + t, + assistant, + options, + ); + for (const messages of segments) { + if (messages.length > 0) out.push({ thread: t, messages }); + } + } + return out; +} + +/* -------------------------- Multi-character helpers ------------------------ */ + +/** + * Represents a unique character found in Glowfic threads. + */ +export interface GlowficCharacter { + /** Primary identifier: handle if available, else display name */ + id: string; + handle: string | null; + displayName: string | null; + author: string | null; + postCount: number; +} + +/** + * Extract all unique characters from a set of Glowfic threads. + * Characters are keyed by handle (preferred) or display name. + * Returns sorted by post count descending. 
+ */ +export function extractUniqueCharacters( + threads: GlowThread[], +): GlowficCharacter[] { + const charMap = new Map(); + + for (const thread of threads) { + for (const post of thread.posts || []) { + const handle = (post.character_handle || "").trim(); + const displayName = (post.character_display_name || "").trim(); + const author = (post.author || "").trim(); + + // Use handle as primary key, fall back to display name + const id = handle || displayName || author || "unknown"; + if (!id || id === "unknown") continue; + + const existing = charMap.get(id); + if (existing) { + existing.postCount++; + // Fill in missing fields if available + if (!existing.handle && handle) existing.handle = handle; + if (!existing.displayName && displayName) existing.displayName = displayName; + if (!existing.author && author) existing.author = author; + } else { + charMap.set(id, { + id, + handle: handle || null, + displayName: displayName || null, + author: author || null, + postCount: 1, + }); + } + } + } + + // Sort by post count descending + return Array.from(charMap.values()).sort((a, b) => b.postCount - a.postCount); +} + +/** + * Result of segmenting a board by all characters. + */ +export interface MultiCharacterResult { + character: GlowficCharacter; + /** Array of conversation segments, each is [user, assistant] pairs */ + conversations: ChatMessage[][]; + /** Total message count across all conversations */ + messageCount: number; +} + +/** + * Segment all threads for each unique character as the assistant. + * + * @param threads - Pre-fetched Glowfic threads + * @param options - Configuration options + * @returns Array of results per character, sorted by conversation count + */ +export function segmentBoardByAllCharacters( + threads: GlowThread[], + options?: { + minPosts?: number; + markdown?: boolean; + }, +): MultiCharacterResult[] { + const minPosts = options?.minPosts ?? 
10; + const characters = extractUniqueCharacters(threads); + + const results: MultiCharacterResult[] = []; + + for (const char of characters) { + // Skip characters below threshold + if (char.postCount < minPosts) continue; + + // Build a matcher for this character + const matcher: AssistantMatcher = (post) => { + const postHandle = (post.character_handle || "").trim().toLowerCase(); + const postDisplay = (post.character_display_name || "").trim().toLowerCase(); + const charId = char.id.toLowerCase(); + return postHandle === charId || postDisplay === charId; + }; + + // Collect all conversation segments for this character + const conversations: ChatMessage[][] = []; + let messageCount = 0; + + for (const thread of threads) { + const segments = segmentedConversationsFromGlowficThread( + thread, + matcher, + { markdown: options?.markdown !== false }, + ); + for (const msgs of segments) { + if (msgs.length > 0) { + conversations.push(msgs); + messageCount += msgs.length; + } + } + } + + // Only include if we got any conversations + if (conversations.length > 0) { + results.push({ + character: char, + conversations, + messageCount, + }); + } + } + + // Sort by number of conversations descending + return results.sort((a, b) => b.conversations.length - a.conversations.length); +} + +/* ------------------------------- SourceAdapter ---------------------------- */ + +import type { SourceAdapter, Logger } from "../index"; + +/** + * A pluggable SourceAdapter for Glowfic URLs. + * - detect(): true for glowfic.com posts/sections/boards + * - ingest(): returns normalized ContentItem[] flattened from all threads + * + * Note: Use the conversation helpers above if you want ChatMessage dialogs + * with a chosen assistant character. The adapter focuses on generic items. 
+ */ +export const GlowficSourceAdapter: SourceAdapter = { + kind: "glowfic", + async detect(pathOrUri: string): Promise { + return detectGlowficUri(pathOrUri); + }, + async ingest(pathOrUri: string, log: Logger): Promise { + const threads = await fetchGlowficThreads(pathOrUri, (l, m) => log(l, m), { + markdown: true, + }); + return normalizeGlowficThreadsToItems(threads, { markdown: true }); + }, +}; + +/* --------------------------------- utils ---------------------------------- */ + +function mergeSameRole(messages: ChatMessage[]): ChatMessage[] { + if (messages.length === 0) return messages; + const out: ChatMessage[] = []; + let curRole: Role | null = null; + let cur: string[] = []; + const flush = () => { + if (curRole && cur.length) { + const content = cur.join("\n\n").trim(); + if (content) out.push({ role: curRole, content }); + } + curRole = null; + cur = []; + }; + for (const m of messages) { + if (curRole !== m.role) { + flush(); + curRole = m.role; + cur.push(m.content); + } else { + cur.push(m.content); + } + } + flush(); + return out; +} diff --git a/tests/integration/basic.test.ts b/tests/integration/basic.test.ts index 4b40614..d403cce 100644 --- a/tests/integration/basic.test.ts +++ b/tests/integration/basic.test.ts @@ -243,13 +243,19 @@ describe("splice CLI integration", () => { // Check that we have exactly one thread (root + self-reply only) const threadsDir = path.join(tempOut, "threads"); - const threadFiles = await fs.readdir(threadsDir); + // Threads are nested in date subdirectories (YYYYMMDD/) + const threadSubdirs = (await fs.readdir(threadsDir, { withFileTypes: true })) + .filter((d) => d.isDirectory()) + .map((d) => d.name); + expect(threadSubdirs.length).toBe(1); + const threadYmdDir = path.join(threadsDir, threadSubdirs[0]); + const threadFiles = await fs.readdir(threadYmdDir); const mdFiles = threadFiles.filter((f) => f.endsWith(".md")); expect(mdFiles.length).toBe(1); // Read the thread content const threadContent = await 
fs.readFile( - path.join(threadsDir, mdFiles[0]), + path.join(threadYmdDir, mdFiles[0]), "utf8", ); diff --git a/tests/integration/media.test.ts b/tests/integration/media.test.ts index 8fc5f13..a5c8993 100644 --- a/tests/integration/media.test.ts +++ b/tests/integration/media.test.ts @@ -89,17 +89,23 @@ describe("splice CLI media handling", () => { expect(copiedStat.size).toBeGreaterThan(0); // Find the generated thread markdown and assert the image link is present - const threadFiles = await fs.readdir(threadsDir); + // Threads are nested in date subdirectories (YYYYMMDD/) + const threadSubdirs = (await fs.readdir(threadsDir, { withFileTypes: true })) + .filter((d) => d.isDirectory()) + .map((d) => d.name); + expect(threadSubdirs.length).toBeGreaterThan(0); + const threadYmdDir = path.join(threadsDir, threadSubdirs[0]); + const threadFiles = await fs.readdir(threadYmdDir); const mdFiles = threadFiles.filter((f) => f.endsWith(".md")); expect(mdFiles.length).toBeGreaterThan(0); // Read first thread file - const threadMdPath = path.join(threadsDir, mdFiles[0]); + const threadMdPath = path.join(threadYmdDir, mdFiles[0]); const threadContent = await fs.readFile(threadMdPath, "utf8"); - // Threads are saved under out/threads, so images are linked as ../images/_ + // Threads are saved under out/threads/YYYYMMDD/, so images are linked as ../../images/_ const expectedMdImage = - `![${sourceMediaBasename}](../images/${copiedMediaBasename})`; + `![${sourceMediaBasename}](../../images/${copiedMediaBasename})`; expect(threadContent).toContain(expectedMdImage); }, diff --git a/tsconfig.json b/tsconfig.json index f67c65c..847aff0 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -15,6 +15,7 @@ "@cli/*": ["src/cli/*"] }, "types": ["node"], + "typeRoots": ["./src/externals", "./node_modules/@types"], "resolveJsonModule": true, "esModuleInterop": true, "allowSyntheticDefaultImports": true,