From 2d6c7ea389903cdf639854395fd2974600760722 Mon Sep 17 00:00:00 2001 From: Lukasz Gandecki Date: Wed, 4 Feb 2026 20:17:33 +0100 Subject: [PATCH 1/4] paragraph counts. restore unwrapped and other fixes --- .gitignore | 2 + apps/pipeline/package.json | 1 + apps/pipeline/src/callGrokAzure.ts | 50 ++++ apps/pipeline/src/callO3.ts | 1 - apps/pipeline/src/helpers/logError.ts | 8 + apps/pipeline/src/lib/domParser.ts | 16 ++ apps/pipeline/src/lib/paragraphCount.ts | 10 + .../src/scripts/fix-legacy-play-chapters.ts | 134 +++++++++ .../src/scripts/fix-non-play-chapters.ts | 107 +++++++ ...nwrapped-paragraphs-in-temporary-output.ts | 134 +++++++++ .../src/scripts/scan-non-html-tags.ts | 226 +++++++++++++++ .../scripts/upload-chapters-source.spec.ts | 14 + .../src/scripts/upload-chapters-source.ts | 62 +++- .../src/scripts/upload-fixed-nonplays.ts | 188 ++++++++++++ .../src/server/backfill-paragraph-counts.ts | 168 +++++++++++ apps/pipeline/src/server/clone-book.ts | 3 + apps/pipeline/src/server/convex-client.ts | 1 + .../src/server/fix-chapters-upload.ts | 3 + apps/pipeline/src/server/pipeline.ts | 3 + .../src/server/regenerate-missing-avatars.ts | 57 ++++ apps/pipeline/src/server/upload-chapters.ts | 69 ----- .../tools/NewRewriteParagraphsPromptBook.md | 55 +++- .../NewRewriteParagraphsPromptBookChunked.md | 55 +++- .../pipeline/src/tools/chapterChunker.spec.ts | 42 +++ apps/pipeline/src/tools/chapterChunker.ts | 2 +- .../tools/fix-legacy-play-custom-tags.spec.ts | 70 +++++ .../src/tools/fix-legacy-play-custom-tags.ts | 168 +++++++++++ .../tools/fix-legacy-play-didaskalia.spec.ts | 94 ++++++ .../src/tools/fix-legacy-play-didaskalia.ts | 120 ++++++++ .../fix-legacy-play-multi-speaker.spec.ts | 39 +++ .../tools/fix-legacy-play-multi-speaker.ts | 85 ++++++ .../fix-legacy-play-stage-directions.spec.ts | 77 +++++ .../tools/fix-legacy-play-stage-directions.ts | 113 ++++++++ .../tools/fix-non-play-custom-tags.spec.ts | 119 ++++++++ .../src/tools/fix-non-play-custom-tags.ts | 268 ++++++++++++++++++ .../getParagraphsFromChapterWithText.spec.ts | 34 +++ .../tools/getParagraphsFromChapterWithText.ts | 2 - .../identifyEntityAndRewriteParagraphs.ts | 27 +- apps/pipeline/src/tools/importScannedBook.ts | 5 +- .../src/tools/importScannedBookIncremental.ts | 3 + .../new-tooling/generate-flux-schnel-image.ts | 44 ++- .../generate-pictures-for-entities.ts | 12 - .../get-chapter-by-chapter-summary.ts | 5 +- ...by-chapter-with-paragraphs-json-summary.ts | 21 +- .../new-tooling/get-chapter-title.spec.ts | 107 +++++++ .../tools/new-tooling/get-chapter-title.ts | 95 ++++++- ...t-reference-cards-for-whole-book-prompt.md | 3 +- .../get-reference-cards-for-whole-book.ts | 12 +- .../restore-unwrapped-blocks.spec.ts | 36 +++ .../new-tooling/restore-unwrapped-blocks.ts | 166 +++++++++++ .../tools/new-tooling/section-wrapper.spec.ts | 37 +++ .../src/tools/new-tooling/section-wrapper.ts | 73 +++++ .../new-tooling/summaryParagraphs.spec.ts | 27 ++ .../tools/new-tooling/summaryParagraphs.ts | 5 + .../src/tools/se-converter/importSEBook.ts | 7 +- apps/pipeline/tsconfig.json | 2 +- .../components/modals/BookChaptersModal.tsx | 5 +- .../src/components/modals/CharacterModal.tsx | 56 +++- .../src/components/modals/SearchModal.tsx | 21 +- .../player/src/helpers/minorCharacterUtils.ts | 24 ++ apps/player/src/locales/en/translation.json | 2 +- apps/player/src/locales/pl/translation.json | 2 +- .../src/services/__tests__/formatB.test.ts | 20 ++ .../services/__tests__/paragraphCount.test.ts | 26 ++ apps/player/src/services/htmlNormalizer.ts 
| 49 +++- apps/player/src/utils/getChapterTitle.ts | 107 ------- bun.lock | 5 + 67 files changed, 3329 insertions(+), 275 deletions(-) create mode 100644 apps/pipeline/src/callGrokAzure.ts create mode 100644 apps/pipeline/src/helpers/logError.ts create mode 100644 apps/pipeline/src/lib/domParser.ts create mode 100644 apps/pipeline/src/lib/paragraphCount.ts create mode 100644 apps/pipeline/src/scripts/fix-legacy-play-chapters.ts create mode 100644 apps/pipeline/src/scripts/fix-non-play-chapters.ts create mode 100644 apps/pipeline/src/scripts/fix-unwrapped-paragraphs-in-temporary-output.ts create mode 100644 apps/pipeline/src/scripts/scan-non-html-tags.ts create mode 100644 apps/pipeline/src/scripts/upload-chapters-source.spec.ts create mode 100644 apps/pipeline/src/scripts/upload-fixed-nonplays.ts create mode 100644 apps/pipeline/src/server/backfill-paragraph-counts.ts delete mode 100644 apps/pipeline/src/server/upload-chapters.ts create mode 100644 apps/pipeline/src/tools/chapterChunker.spec.ts create mode 100644 apps/pipeline/src/tools/fix-legacy-play-custom-tags.spec.ts create mode 100644 apps/pipeline/src/tools/fix-legacy-play-custom-tags.ts create mode 100644 apps/pipeline/src/tools/fix-legacy-play-didaskalia.spec.ts create mode 100644 apps/pipeline/src/tools/fix-legacy-play-didaskalia.ts create mode 100644 apps/pipeline/src/tools/fix-legacy-play-multi-speaker.spec.ts create mode 100644 apps/pipeline/src/tools/fix-legacy-play-multi-speaker.ts create mode 100644 apps/pipeline/src/tools/fix-legacy-play-stage-directions.spec.ts create mode 100644 apps/pipeline/src/tools/fix-legacy-play-stage-directions.ts create mode 100644 apps/pipeline/src/tools/fix-non-play-custom-tags.spec.ts create mode 100644 apps/pipeline/src/tools/fix-non-play-custom-tags.ts create mode 100644 apps/pipeline/src/tools/getParagraphsFromChapterWithText.spec.ts create mode 100644 apps/pipeline/src/tools/new-tooling/get-chapter-title.spec.ts create mode 100644 apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.spec.ts create mode 100644 apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.ts create mode 100644 apps/pipeline/src/tools/new-tooling/section-wrapper.spec.ts create mode 100644 apps/pipeline/src/tools/new-tooling/section-wrapper.ts create mode 100644 apps/pipeline/src/tools/new-tooling/summaryParagraphs.spec.ts create mode 100644 apps/pipeline/src/tools/new-tooling/summaryParagraphs.ts create mode 100644 apps/player/src/helpers/minorCharacterUtils.ts create mode 100644 apps/player/src/services/__tests__/paragraphCount.test.ts delete mode 100644 apps/player/src/utils/getChapterTitle.ts diff --git a/.gitignore b/.gitignore index 494a3e8d..3e48028e 100644 --- a/.gitignore +++ b/.gitignore @@ -68,3 +68,5 @@ fastlane/test_output claude-agent-sdk-demos xcuserdata + +apps/pipeline/app.log diff --git a/apps/pipeline/package.json b/apps/pipeline/package.json index ac690da2..2ce1b9c1 100644 --- a/apps/pipeline/package.json +++ b/apps/pipeline/package.json @@ -20,6 +20,7 @@ }, "dependencies": { "@ai-sdk/anthropic": "2.0.38", + "@ai-sdk/azure": "2.0.91", "@ai-sdk/cerebras": "^1.0.20", "@ai-sdk/google": "^2.0.14", "@ai-sdk/groq": "^2.0.21", diff --git a/apps/pipeline/src/callGrokAzure.ts b/apps/pipeline/src/callGrokAzure.ts new file mode 100644 index 00000000..13d29f94 --- /dev/null +++ b/apps/pipeline/src/callGrokAzure.ts @@ -0,0 +1,50 @@ +import OpenAI from "openai"; +import { type z } from "zod"; + +const endpoint = "https://bookgenius.services.ai.azure.com/openai/v1/"; +const model = 
"grok-4-fast-reasoning"; +const api_key = process.env.AZURE_GROK_KEY; + +const client = new OpenAI({ baseURL: endpoint, apiKey: api_key }); + +export const callGrokAzure = async (prompt: string) => { + const completion = await client.chat.completions.create({ + messages: [{ role: "user", content: prompt }], + model, + }); + + return completion.choices[0].message.content; +}; + +export const callGrokAzureWithSchema = async (prompt: string, zodSchema: z.ZodSchema) => { + const completion = await client.chat.completions.create({ + messages: [{ role: "user", content: prompt }], + model, + response_format: { + type: "json_schema", + json_schema: { + name: "response", + strict: true, + // @ts-expect-error(zod typing) + schema: zodSchema.shape, + }, + }, + }); + let result: T; + try { + result = JSON.parse(completion.choices[0].message.content as string) as T; + } catch (e) { + console.error("Error parsing JSON", e); + throw e; + } + return result; +}; + +// if (require.main === module) { +// const schema = z.object({ name: z.string(), age: z.number() }); +// const prompt = "What is my name? My name is John Doe and I'm 30"; +// const result = await callGrokAzureWithSchema(prompt, schema); +// console.log(result); +// console.log(result.name); +// console.log(result.age); +// } diff --git a/apps/pipeline/src/callO3.ts b/apps/pipeline/src/callO3.ts index 0e498b4e..a14782fc 100644 --- a/apps/pipeline/src/callO3.ts +++ b/apps/pipeline/src/callO3.ts @@ -22,7 +22,6 @@ export const callO3WithSchema = async ( model: openai(model), schema: zodSchema, prompt, - // providerOptions: { google: { thinkingConfig: { thinkingBudget: 0, includeThoughts: true } } }, experimental_telemetry: { isEnabled: true, recordInputs: true, recordOutputs: true }, }); diff --git a/apps/pipeline/src/helpers/logError.ts b/apps/pipeline/src/helpers/logError.ts new file mode 100644 index 00000000..0e62486a --- /dev/null +++ b/apps/pipeline/src/helpers/logError.ts @@ -0,0 +1,8 @@ +export function logError(contextMessage: string, err: unknown) { + if (err instanceof Error) { + console.error(`${contextMessage} ${err.message}`); + console.error(err.stack); + return; + } + console.error(`${contextMessage} ${String(err)}`); +} diff --git a/apps/pipeline/src/lib/domParser.ts b/apps/pipeline/src/lib/domParser.ts new file mode 100644 index 00000000..0f0301e7 --- /dev/null +++ b/apps/pipeline/src/lib/domParser.ts @@ -0,0 +1,16 @@ +import { JSDOM } from "jsdom"; + +let initialized = false; + +export function ensureDomParser(): void { + if (typeof (globalThis as { DOMParser?: unknown }).DOMParser !== "undefined") { + return; + } + if (initialized) { + return; + } + + const { window } = new JSDOM(""); + (globalThis as { DOMParser: typeof window.DOMParser }).DOMParser = window.DOMParser; + initialized = true; +} diff --git a/apps/pipeline/src/lib/paragraphCount.ts b/apps/pipeline/src/lib/paragraphCount.ts new file mode 100644 index 00000000..9cf8bb1c --- /dev/null +++ b/apps/pipeline/src/lib/paragraphCount.ts @@ -0,0 +1,10 @@ +import { + countParagraphsFromChapterHtml, + type ParagraphCountOptions, +} from "@player/services/htmlNormalizer"; +import { ensureDomParser } from "./domParser"; + +export function computeParagraphCount(html: string, options?: ParagraphCountOptions): number { + ensureDomParser(); + return countParagraphsFromChapterHtml(html, options); +} diff --git a/apps/pipeline/src/scripts/fix-legacy-play-chapters.ts b/apps/pipeline/src/scripts/fix-legacy-play-chapters.ts new file mode 100644 index 00000000..1a0f8993 --- /dev/null +++ 
b/apps/pipeline/src/scripts/fix-legacy-play-chapters.ts @@ -0,0 +1,134 @@ +#!/usr/bin/env bun +import fs from "fs"; +import os from "os"; +import path from "path"; +import { fixLegacyPlayDidaskalia } from "../tools/fix-legacy-play-didaskalia"; +import { fixLegacyPlayCustomTags } from "../tools/fix-legacy-play-custom-tags"; +import { fixLegacyPlayStageDirections } from "../tools/fix-legacy-play-stage-directions"; +import { + applyMultiSpeakerMapToHtml, + extractMultiSpeakerNextLineMapFromXml, +} from "../tools/fix-legacy-play-multi-speaker"; + +type Args = { bookSlug: string; sourceDir: string; outputDir: string; dryRun: boolean }; + +function resolvePath(inputPath: string): string { + if (inputPath.startsWith("~/")) { + return path.join(process.env.HOME ?? "", inputPath.slice(2)); + } + return path.resolve(inputPath); +} + +function parseArgs(): Args { + const args = process.argv.slice(2); + if (args.length < 1) { + console.error( + "Usage: bun apps/pipeline/src/scripts/fix-legacy-play-chapters.ts [--source ] [--out ] [--dry-run]", + ); + process.exit(1); + } + + const bookSlug = args[0]; + const sourceIdx = args.indexOf("--source"); + const outIdx = args.indexOf("--out"); + + const repoRoot = path.resolve(process.cwd()); + const defaultSource = path.join(repoRoot, "ConvexAssets", "books", bookSlug, "chapters-source"); + const defaultOut = path.join(os.tmpdir(), "bookgenius-fixed-chapters", bookSlug); + + return { + bookSlug, + sourceDir: resolvePath(sourceIdx !== -1 ? args[sourceIdx + 1] : defaultSource), + outputDir: resolvePath(outIdx !== -1 ? args[outIdx + 1] : defaultOut), + dryRun: args.includes("--dry-run"), + }; +} + +function isPlayBook(metadataXml: string): boolean { + return /
\s*Play\s*<\/Form>/i.test(metadataXml); +} + +function getChapterNumberFromFilename(filename: string): number | null { + const match = filename.match(/chapter-(\d+)\.html$/i); + if (!match) return null; + return parseInt(match[1], 10); +} + +async function main(): Promise { + const { bookSlug, sourceDir, outputDir, dryRun } = parseArgs(); + const repoRoot = path.resolve(process.cwd()); + const booksContentDir = path.join(repoRoot, "books", bookSlug, "booksContent"); + const metadataPath = path.join(booksContentDir, "metadata.xml"); + + if (!fs.existsSync(metadataPath)) { + console.error(`Missing metadata.xml for ${bookSlug}: ${metadataPath}`); + process.exit(1); + } + + const metadataXml = fs.readFileSync(metadataPath, "utf-8"); + if (!isPlayBook(metadataXml)) { + console.error(`Book ${bookSlug} is not marked as Play.`); + process.exit(1); + } + + if (!fs.existsSync(sourceDir)) { + console.error(`Source directory not found: ${sourceDir}`); + process.exit(1); + } + + const htmlFiles = fs + .readdirSync(sourceDir) + .filter((file) => file.toLowerCase().endsWith(".html")); + + if (htmlFiles.length === 0) { + console.error(`No .html files found in ${sourceDir}`); + process.exit(1); + } + + if (!dryRun) { + fs.mkdirSync(outputDir, { recursive: true }); + } + + let changed = 0; + let processed = 0; + + for (const file of htmlFiles) { + const chapterNumber = getChapterNumberFromFilename(file); + const sourcePath = path.join(sourceDir, file); + const html = fs.readFileSync(sourcePath, "utf-8"); + + let updated = fixLegacyPlayStageDirections(html); + updated = fixLegacyPlayCustomTags(updated); + updated = fixLegacyPlayDidaskalia(updated); + + if (chapterNumber !== null) { + const xmlPath = path.join(booksContentDir, `chapter${chapterNumber}.xml`); + if (fs.existsSync(xmlPath)) { + const xml = fs.readFileSync(xmlPath, "utf-8"); + const map = extractMultiSpeakerNextLineMapFromXml(xml); + updated = applyMultiSpeakerMapToHtml(updated, map); + } + } + + processed += 1; + + if (updated !== html) { + changed += 1; + } + + if (dryRun) { + const note = updated !== html ? " (changed)" : ""; + console.log(`[dry-run] Would write ${file}${note}`); + } else { + const outPath = path.join(outputDir, file); + fs.writeFileSync(outPath, updated, "utf-8"); + } + } + + console.log(`Done. Processed ${processed} chapter(s). Changed ${changed}. Output: ${outputDir}`); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/apps/pipeline/src/scripts/fix-non-play-chapters.ts b/apps/pipeline/src/scripts/fix-non-play-chapters.ts new file mode 100644 index 00000000..ed3056ff --- /dev/null +++ b/apps/pipeline/src/scripts/fix-non-play-chapters.ts @@ -0,0 +1,107 @@ +#!/usr/bin/env bun +import fs from "fs"; +import os from "os"; +import path from "path"; +import { fixNonPlayCustomTags } from "../tools/fix-non-play-custom-tags"; + +type Args = { sourceRoot: string; outputRoot: string; slugs: string[] | null; dryRun: boolean }; + +const PLAY_SLUGS = new Set([ + "Hamlet", + "Macbeth", + "Midsummer-Nights-Dream", + "Othello", + "Romeo-And-Juliet", + "The-Tempest", + "Romeo-And-Juliet-Small", + "Romeo-And-Juliet-Smaller", +]); + +function resolvePath(inputPath: string): string { + if (inputPath.startsWith("~/")) { + return path.join(process.env.HOME ?? 
"", inputPath.slice(2)); + } + return path.resolve(inputPath); +} + +function parseArgs(): Args { + const args = process.argv.slice(2); + const sourceIdx = args.indexOf("--source"); + const outIdx = args.indexOf("--out"); + const slugsIdx = args.indexOf("--slugs"); + + const repoRoot = path.resolve(process.cwd()); + const defaultSource = path.join(repoRoot, "ConvexAssets", "books"); + const defaultOut = path.join(os.tmpdir(), "bookgenius-fixed-nonplays"); + + const slugs = + slugsIdx !== -1 ? (args[slugsIdx + 1]?.split(",").map((s) => s.trim()) ?? []) : null; + + return { + sourceRoot: resolvePath(sourceIdx !== -1 ? args[sourceIdx + 1] : defaultSource), + outputRoot: resolvePath(outIdx !== -1 ? args[outIdx + 1] : defaultOut), + slugs, + dryRun: args.includes("--dry-run"), + }; +} + +function listSlugs(root: string): string[] { + if (!fs.existsSync(root)) return []; + return fs + .readdirSync(root, { withFileTypes: true }) + .filter((entry) => entry.isDirectory()) + .map((entry) => entry.name); +} + +function listHtmlFiles(dir: string): string[] { + if (!fs.existsSync(dir)) return []; + return fs + .readdirSync(dir) + .filter((file) => file.toLowerCase().endsWith(".html")) + .map((file) => path.join(dir, file)); +} + +async function main(): Promise { + const { sourceRoot, outputRoot, slugs, dryRun } = parseArgs(); + const targetSlugs = slugs ?? listSlugs(sourceRoot); + + let processedBooks = 0; + let processedFiles = 0; + let changedFiles = 0; + + for (const slug of targetSlugs) { + if (PLAY_SLUGS.has(slug)) continue; + + const chaptersDir = path.join(sourceRoot, slug, "chapters-source"); + const files = listHtmlFiles(chaptersDir); + if (files.length === 0) continue; + + processedBooks += 1; + const outDir = path.join(outputRoot, slug, "chapters-source"); + if (!dryRun) { + fs.mkdirSync(outDir, { recursive: true }); + } + + for (const file of files) { + const html = fs.readFileSync(file, "utf-8"); + const updated = fixNonPlayCustomTags(html); + processedFiles += 1; + if (updated !== html) { + changedFiles += 1; + } + if (!dryRun) { + const outPath = path.join(outDir, path.basename(file)); + fs.writeFileSync(outPath, updated, "utf-8"); + } + } + } + + console.log( + `Done. Books: ${processedBooks}, Files: ${processedFiles}, Changed: ${changedFiles}. Output: ${outputRoot}`, + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/apps/pipeline/src/scripts/fix-unwrapped-paragraphs-in-temporary-output.ts b/apps/pipeline/src/scripts/fix-unwrapped-paragraphs-in-temporary-output.ts new file mode 100644 index 00000000..fe85f39a --- /dev/null +++ b/apps/pipeline/src/scripts/fix-unwrapped-paragraphs-in-temporary-output.ts @@ -0,0 +1,134 @@ +#!/usr/bin/env bun +import fs from "fs"; +import path from "path"; +import { restoreUnwrappedBlocks } from "../tools/new-tooling/restore-unwrapped-blocks"; +import { buildSectionWrapper, extractSectionInner } from "../tools/new-tooling/section-wrapper"; + +const DEFAULT_OUTPUT_ROOT = + "/var/folders/j9/pbqwg7zs4336w7vccnz2xhcw0000gn/T/bookgenius-fixed-unwrapped"; + +type Args = { sourceRoot: string; outputRoot: string; slugs: string[] }; + +function resolvePath(inputPath: string): string { + if (inputPath.startsWith("~/")) { + return path.join(process.env.HOME ?? 
"", inputPath.slice(2)); + } + return path.resolve(inputPath); +} + +function parseArgs(): Args { + const args = process.argv.slice(2); + const sourceIdx = args.indexOf("--source"); + const outputIdx = args.indexOf("--out"); + const slugsIdx = args.indexOf("--slugs"); + + const repoRoot = path.resolve(process.cwd()); + const defaultSource = path.join(repoRoot, "apps", "pipeline", "books-data"); + + const sourceRoot = resolvePath(sourceIdx !== -1 ? args[sourceIdx + 1] : defaultSource); + const outputRoot = resolvePath(outputIdx !== -1 ? args[outputIdx + 1] : DEFAULT_OUTPUT_ROOT); + + let slugs: string[] = []; + if (slugsIdx !== -1) { + slugs = args[slugsIdx + 1]?.split(",").map((slug) => slug.trim()) ?? []; + } else { + slugs = fs + .readdirSync(sourceRoot, { withFileTypes: true }) + .filter((entry) => entry.isDirectory()) + .map((entry) => entry.name); + } + + return { sourceRoot, outputRoot, slugs }; +} + +function loadOriginalHtml(tempDir: string, chapter: number): string | null { + const directPath = path.join(tempDir, `original-paragraphs-for-chapter-${chapter}.xml`); + if (fs.existsSync(directPath)) { + return fs.readFileSync(directPath, "utf-8"); + } + + const prefix = `original-paragraphs-for-chapter-${chapter}-chunk-`; + const chunkFiles = fs + .readdirSync(tempDir) + .filter((file) => file.startsWith(prefix) && file.endsWith(".xml")) + .map((file) => ({ file, index: Number(file.slice(prefix.length).replace(/\.xml$/, "")) })) + .filter((entry) => Number.isFinite(entry.index)) + .sort((a, b) => a.index - b.index); + + if (chunkFiles.length === 0) return null; + + return chunkFiles + .map((entry) => fs.readFileSync(path.join(tempDir, entry.file), "utf-8")) + .join("\n"); +} + +function ensureDir(dir: string) { + fs.mkdirSync(dir, { recursive: true }); +} + +function main() { + const { sourceRoot, outputRoot, slugs } = parseArgs(); + + let totalFiles = 0; + let changedFiles = 0; + + for (const slug of slugs) { + const tempDir = path.join(sourceRoot, slug, "temporary-output"); + if (!fs.existsSync(tempDir)) { + console.warn(`Skipping ${slug}: temporary-output not found`); + continue; + } + + const outputTempDir = path.join(outputRoot, slug, "temporary-output"); + ensureDir(outputTempDir); + + const rewrittenFiles = fs + .readdirSync(tempDir) + .filter((file) => /^rewritten-paragraphs-for-chapter-\d+\.xml$/.test(file)); + + if (rewrittenFiles.length === 0) { + console.warn(`Skipping ${slug}: no rewritten-paragraphs files`); + continue; + } + + let slugChanged = 0; + + for (const file of rewrittenFiles) { + totalFiles += 1; + const chapter = Number(file.match(/(\d+)/)?.[1]); + if (!Number.isFinite(chapter)) { + continue; + } + + const modelRaw = fs.readFileSync(path.join(tempDir, file), "utf-8"); + const originalRaw = loadOriginalHtml(tempDir, chapter); + if (!originalRaw) { + console.warn(`Skipping ${slug} chapter ${chapter}: original paragraphs not found`); + continue; + } + + const modelExtract = extractSectionInner(modelRaw); + const originalExtract = extractSectionInner(originalRaw); + + const fixedInner = restoreUnwrappedBlocks(originalExtract.inner, modelExtract.inner); + const hasChanges = fixedInner !== modelExtract.inner; + + const output = hasChanges ? buildSectionWrapper(fixedInner, modelExtract.wrapper) : modelRaw; + + fs.writeFileSync(path.join(outputTempDir, file), output, "utf-8"); + + if (hasChanges) { + slugChanged += 1; + changedFiles += 1; + } + } + + console.log( + `Processed ${slug}: ${rewrittenFiles.length} chapter(s), changed ${slugChanged}. 
Output: ${outputTempDir}`, + ); + } + + console.log(`Done. Processed ${totalFiles} file(s). Changed ${changedFiles}.`); +} + +main(); diff --git a/apps/pipeline/src/scripts/scan-non-html-tags.ts b/apps/pipeline/src/scripts/scan-non-html-tags.ts new file mode 100644 index 00000000..9775f262 --- /dev/null +++ b/apps/pipeline/src/scripts/scan-non-html-tags.ts @@ -0,0 +1,226 @@ +#!/usr/bin/env bun +import fs from "fs"; +import path from "path"; + +type Args = { sourceRoot: string; slugs: string[]; limit: number }; + +const HTML_TAGS = new Set([ + "a", + "abbr", + "address", + "article", + "aside", + "audio", + "b", + "bdi", + "bdo", + "blockquote", + "br", + "button", + "canvas", + "caption", + "cite", + "code", + "col", + "colgroup", + "data", + "datalist", + "dd", + "del", + "details", + "dfn", + "dialog", + "div", + "dl", + "dt", + "em", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "header", + "hr", + "html", + "i", + "img", + "input", + "ins", + "kbd", + "label", + "legend", + "li", + "link", + "main", + "map", + "mark", + "meta", + "meter", + "nav", + "noscript", + "object", + "ol", + "optgroup", + "option", + "output", + "p", + "param", + "picture", + "pre", + "progress", + "q", + "rp", + "rt", + "ruby", + "s", + "samp", + "script", + "section", + "select", + "small", + "source", + "span", + "strong", + "style", + "sub", + "summary", + "sup", + "table", + "tbody", + "td", + "template", + "textarea", + "tfoot", + "th", + "thead", + "time", + "title", + "tr", + "track", + "u", + "ul", + "var", + "video", + "wbr", +]); + +function resolvePath(inputPath: string): string { + if (inputPath.startsWith("~/")) { + return path.join(process.env.HOME ?? "", inputPath.slice(2)); + } + return path.resolve(inputPath); +} + +function parseArgs(): Args { + const args = process.argv.slice(2); + const sourceIdx = args.indexOf("--source"); + const slugsIdx = args.indexOf("--slugs"); + const limitIdx = args.indexOf("--limit"); + + const repoRoot = path.resolve(process.cwd()); + const defaultSource = path.join(repoRoot, "ConvexAssets", "books"); + + const sourceRoot = resolvePath(sourceIdx !== -1 ? args[sourceIdx + 1] : defaultSource); + const limit = limitIdx !== -1 ? Number(args[limitIdx + 1]) : 5; + + let slugs: string[] = []; + if (slugsIdx !== -1) { + slugs = args[slugsIdx + 1]?.split(",").map((slug) => slug.trim()) ?? 
[]; + } else { + slugs = fs + .readdirSync(sourceRoot, { withFileTypes: true }) + .filter((entry) => entry.isDirectory()) + .map((entry) => entry.name); + } + + return { sourceRoot, slugs, limit }; +} + +function listHtmlFiles(dir: string): string[] { + if (!fs.existsSync(dir)) return []; + return fs + .readdirSync(dir) + .filter((file) => file.toLowerCase().endsWith(".html")) + .map((file) => path.join(dir, file)); +} + +function normalizeSnippet(text: string): string { + return text.replace(/\s+/g, " ").trim(); +} + +function main() { + const { sourceRoot, slugs, limit } = parseArgs(); + const regex = /<\s*([A-Za-z][A-Za-z0-9-]*)\b/g; + const ignoredTags = new Set(["hgroup"]); + + const missing: string[] = []; + const tagCounts = new Map(); + const tagSamples = new Map>(); + + let totalFiles = 0; + let scannedSlugs = 0; + + for (const slug of slugs) { + const chaptersDir = path.join(sourceRoot, slug, "chapters-source"); + const files = listHtmlFiles(chaptersDir); + if (files.length === 0) { + missing.push(slug); + continue; + } + + scannedSlugs += 1; + totalFiles += files.length; + + for (const file of files) { + const text = fs.readFileSync(file, "utf-8"); + let match: RegExpExecArray | null; + while ((match = regex.exec(text))) { + const tag = match[1].toLowerCase(); + if (HTML_TAGS.has(tag) || ignoredTags.has(tag)) continue; + + tagCounts.set(tag, (tagCounts.get(tag) ?? 0) + 1); + + const samples = tagSamples.get(tag) ?? []; + if (samples.length < limit) { + const start = Math.max(0, match.index - 120); + const end = Math.min(text.length, match.index + 200); + samples.push({ file, snippet: normalizeSnippet(text.slice(start, end)) }); + tagSamples.set(tag, samples); + } + } + } + } + + console.log(`Scanned ${totalFiles} file(s) across ${scannedSlugs} slug(s).`); + if (missing.length > 0) { + console.log(`Skipped ${missing.length} slug(s) without chapters-source:`); + console.log(missing.join(", ")); + } + + if (tagCounts.size === 0) { + console.log("No non-HTML tags found."); + return; + } + + const sortedTags = Array.from(tagCounts.entries()).sort((a, b) => a[0].localeCompare(b[0])); + console.log(`Found ${sortedTags.length} non-HTML tag(s):`); + for (const [tag, count] of sortedTags) { + console.log(`- <${tag}> (${count} match(es))`); + const samples = tagSamples.get(tag) ?? 
[]; + for (const sample of samples) { + console.log(` ${sample.file}`); + console.log(` ${sample.snippet}`); + } + } +} + +if (require.main === module) { + main(); +} diff --git a/apps/pipeline/src/scripts/upload-chapters-source.spec.ts b/apps/pipeline/src/scripts/upload-chapters-source.spec.ts new file mode 100644 index 00000000..47e8dbe3 --- /dev/null +++ b/apps/pipeline/src/scripts/upload-chapters-source.spec.ts @@ -0,0 +1,14 @@ +import { expect, test } from "vitest"; +import { mapFilenameToBasename } from "./upload-chapters-source"; + +test("mapFilenameToBasename for the rewritten-xmls", () => { + const file = "rewritten-paragraphs-for-chapter-1.xml"; + const basename = mapFilenameToBasename(file); + expect(basename).toBe("chapter-1.html"); +}); + +test("mapFilenameToBasename for the chapter-N.html files", () => { + const file = "chapter-1.html"; + const basename = mapFilenameToBasename(file); + expect(basename).toBe("chapter-1.html"); +}); diff --git a/apps/pipeline/src/scripts/upload-chapters-source.ts b/apps/pipeline/src/scripts/upload-chapters-source.ts index 93d9440b..ec067ada 100644 --- a/apps/pipeline/src/scripts/upload-chapters-source.ts +++ b/apps/pipeline/src/scripts/upload-chapters-source.ts @@ -3,6 +3,9 @@ import path from "path"; import { convex } from "../server/convex-client"; import { AdminConvexHttpClient } from "../lib/AdminConvexHttpClient"; import { api } from "@bookgenius/convex/_generated/api"; +import { getChapterTitle } from "src/tools/new-tooling/get-chapter-title"; +import { DOMParser, type Element as XMLElement } from "@xmldom/xmldom"; +import { computeParagraphCount } from "../lib/paragraphCount"; type Args = { bookSlug: string; @@ -49,7 +52,20 @@ function detectContentType(filePath: string): string { } function listHtmlFiles(inputDir: string): string[] { - return fs.readdirSync(inputDir).filter((file) => file.toLowerCase().endsWith(".html")); + const htmlFiles = fs.readdirSync(inputDir).filter((file) => file.toLowerCase().endsWith(".html")); + if (htmlFiles.length >= 1) { + return htmlFiles; + } else { + const xmlFiles = fs + .readdirSync(inputDir) + .filter((f) => f.match(/^rewritten-paragraphs-for-chapter-\d+\.xml$/)); + if (xmlFiles.length >= 1) { + return xmlFiles; + } else { + console.error(`No .html or .xml files found in ${inputDir}`); + process.exit(1); + } + } } async function main() { @@ -74,7 +90,7 @@ async function main() { const files = stat.isDirectory() ? listHtmlFiles(inputPath).map((file) => ({ source: path.join(inputPath, file), - basename: file, + basename: mapFilenameToBasename(file), })) : [{ source: inputPath, basename: basename ?? 
path.basename(inputPath) }]; @@ -114,6 +130,7 @@ async function main() { } const content = fs.readFileSync(file.source); + const paragraphCount = computeParagraphCount(content.toString("utf-8")); try { await convex.uploadFile({ folderPath, @@ -121,6 +138,16 @@ async function main() { content, contentType: detectContentType(file.source), }); + const bookPath = `books/${bookSlug}`; + await convex.updateChapterMetadata({ + bookPath, + folderPath: `${bookPath}/chapters-source`, + basename: file.basename, + chapterNumber: parseInt(file.basename.split("-")[1], 10), + title: getChapterTitle(parseChapterIntoDom(content.toString("utf-8"))), + paragraphCount, + sourceFormat: "html", + }); console.log(`Uploaded ${file.basename}`); stats.uploaded += 1; } catch (error) { @@ -134,7 +161,30 @@ async function main() { ); } -main().catch((error) => { - console.error("Fatal error:", error); - process.exit(1); -}); +function parseChapterIntoDom(chapter: string): XMLElement { + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/html"); + const root = doc.documentElement as XMLElement; + return root; +} + +export function mapFilenameToBasename(filename: string): string { + const match = filename.match(/^rewritten-paragraphs-for-chapter-(\d+)\.xml$/); + if (match) { + return `chapter-${match[1]}.html`; + } else { + const match = filename.match(/^chapter-(\d+)\.html$/); + if (match) { + return `chapter-${match[1]}.html`; + } else { + throw new Error(`Invalid filename: ${filename}`); + } + } +} + +if (require.main === module) { + main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); + }); +} diff --git a/apps/pipeline/src/scripts/upload-fixed-nonplays.ts b/apps/pipeline/src/scripts/upload-fixed-nonplays.ts new file mode 100644 index 00000000..a4a1f74d --- /dev/null +++ b/apps/pipeline/src/scripts/upload-fixed-nonplays.ts @@ -0,0 +1,188 @@ +#!/usr/bin/env bun +import fs from "fs"; +import path from "path"; +import { convex } from "../server/convex-client"; +import { AdminConvexHttpClient } from "../lib/AdminConvexHttpClient"; +import { api } from "@bookgenius/convex/_generated/api"; +import { getChapterTitle } from "src/tools/new-tooling/get-chapter-title"; +import { DOMParser, type Element as XMLElement } from "@xmldom/xmldom"; +import { computeParagraphCount } from "../lib/paragraphCount"; +import { mapFilenameToBasename } from "./upload-chapters-source"; + +type Args = { sourceRoot: string; slugs: string[] | null; dryRun: boolean; allowNew: boolean }; + +const PLAY_SLUGS = new Set([ + "Hamlet", + "Macbeth", + "Midsummer-Nights-Dream", + "Othello", + "Romeo-And-Juliet", + "The-Tempest", + "Romeo-And-Juliet-Small", + "Romeo-And-Juliet-Smaller", +]); + +function resolvePath(inputPath: string): string { + if (inputPath.startsWith("~/")) { + return path.join(process.env.HOME ?? "", inputPath.slice(2)); + } + return path.resolve(inputPath); +} + +function parseArgs(): Args { + const args = process.argv.slice(2); + const sourceIdx = args.indexOf("--source"); + const slugsIdx = args.indexOf("--slugs"); + + const defaultSource = path.join(process.env.TMPDIR ?? "/tmp", "bookgenius-fixed-nonplays"); + + const slugs = + slugsIdx !== -1 ? (args[slugsIdx + 1]?.split(",").map((s) => s.trim()) ?? []) : null; + + return { + sourceRoot: resolvePath(sourceIdx !== -1 ? 
args[sourceIdx + 1] : defaultSource), + slugs, + dryRun: args.includes("--dry-run"), + allowNew: args.includes("--allow-new"), + }; +} + +function listSlugs(root: string): string[] { + if (!fs.existsSync(root)) return []; + return fs + .readdirSync(root, { withFileTypes: true }) + .filter((entry) => entry.isDirectory()) + .map((entry) => entry.name); +} + +function listHtmlFiles(inputDir: string): string[] { + const htmlFiles = fs.readdirSync(inputDir).filter((file) => file.toLowerCase().endsWith(".html")); + if (htmlFiles.length >= 1) { + return htmlFiles; + } + console.error(`No .html files found in ${inputDir}`); + process.exit(1); +} + +function detectContentType(filePath: string): string { + return filePath.toLowerCase().endsWith(".html") ? "text/html" : "application/octet-stream"; +} + +function parseChapterIntoDom(chapter: string): XMLElement { + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/html"); + const root = doc.documentElement as XMLElement; + return root; +} + +async function uploadSlug( + slug: string, + inputDir: string, + adminClient: AdminConvexHttpClient, + dryRun: boolean, + allowNew: boolean, +): Promise<{ uploaded: number; skipped: number; missing: number; total: number }> { + const folderPath = `books/${slug}/chapters-source`; + const stats = { uploaded: 0, skipped: 0, missing: 0, total: 0 }; + + const files = listHtmlFiles(inputDir).map((file) => ({ + source: path.join(inputDir, file), + basename: mapFilenameToBasename(file), + })); + + for (const file of files) { + stats.total += 1; + + if (!allowNew) { + const existing = await adminClient.query(api.cli.getAsset, { + folderPath, + basename: file.basename, + }); + if (!existing) { + console.error(`Missing asset in Convex: ${folderPath}/${file.basename}`); + stats.missing += 1; + continue; + } + } + + if (dryRun) { + console.log(`[dry-run] Would upload ${file.source} -> ${folderPath}/${file.basename}`); + stats.skipped += 1; + continue; + } + + const content = fs.readFileSync(file.source); + const paragraphCount = computeParagraphCount(content.toString("utf-8")); + try { + await convex.uploadFile({ + folderPath, + basename: file.basename, + content, + contentType: detectContentType(file.source), + }); + const bookPath = `books/${slug}`; + await convex.updateChapterMetadata({ + bookPath, + folderPath: `${bookPath}/chapters-source`, + basename: file.basename, + chapterNumber: parseInt(file.basename.split("-")[1], 10), + title: getChapterTitle(parseChapterIntoDom(content.toString("utf-8"))), + paragraphCount, + sourceFormat: "html", + }); + console.log(`Uploaded ${slug}/${file.basename}`); + stats.uploaded += 1; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.error(`Failed to upload ${slug}/${file.basename}: ${message}`); + } + } + + return stats; +} + +async function main() { + const { sourceRoot, slugs, dryRun, allowNew } = parseArgs(); + const convexUrl = process.env.CONVEX_URL || process.env.NEXT_PUBLIC_CONVEX_URL; + + if (!convexUrl) { + console.error("Missing CONVEX_URL environment variable"); + process.exit(1); + } + + if (!fs.existsSync(sourceRoot)) { + console.error(`Source root not found: ${sourceRoot}`); + process.exit(1); + } + + const adminClient = new AdminConvexHttpClient(convexUrl); + const targetSlugs = slugs ?? 
listSlugs(sourceRoot); + const totals = { uploaded: 0, skipped: 0, missing: 0, total: 0 }; + + for (const slug of targetSlugs) { + if (PLAY_SLUGS.has(slug)) continue; + + const inputDir = path.join(sourceRoot, slug, "chapters-source"); + if (!fs.existsSync(inputDir)) { + console.warn(`Skipping ${slug}: missing ${inputDir}`); + continue; + } + + const stats = await uploadSlug(slug, inputDir, adminClient, dryRun, allowNew); + totals.uploaded += stats.uploaded; + totals.skipped += stats.skipped; + totals.missing += stats.missing; + totals.total += stats.total; + } + + console.log( + `Done. Uploaded: ${totals.uploaded}, skipped: ${totals.skipped}, missing: ${totals.missing}, total: ${totals.total}`, + ); +} + +if (require.main === module) { + main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); + }); +} diff --git a/apps/pipeline/src/server/backfill-paragraph-counts.ts b/apps/pipeline/src/server/backfill-paragraph-counts.ts new file mode 100644 index 00000000..ffecba55 --- /dev/null +++ b/apps/pipeline/src/server/backfill-paragraph-counts.ts @@ -0,0 +1,168 @@ +#!/usr/bin/env bun +/** + * Backfill paragraphCount for chapters-source metadata using player indexing rules. + * + * Usage: + * bun apps/pipeline/src/server/backfill-paragraph-counts.ts [--dry-run] [--limit N] + * bun apps/pipeline/src/server/backfill-paragraph-counts.ts --all [--dry-run] [--limit N] + */ + +import "dotenv/config"; +import { AdminConvexHttpClient } from "../lib/AdminConvexHttpClient"; +import { api } from "@bookgenius/convex/_generated/api"; +import { computeParagraphCount } from "../lib/paragraphCount"; + +type Args = { bookSlug?: string; all: boolean; dryRun: boolean; limit?: number }; + +function parseArgs(): Args { + const args = process.argv.slice(2); + const all = args.includes("--all"); + const dryRun = args.includes("--dry-run"); + const limitIdx = args.indexOf("--limit"); + const limit = limitIdx !== -1 ? Number(args[limitIdx + 1]) : undefined; + let bookSlug: string | undefined; + for (let i = 0; i < args.length; i += 1) { + const arg = args[i]; + if (arg === "--limit") { + i += 1; + continue; + } + if (!arg.startsWith("--") && !bookSlug) { + bookSlug = arg; + } + } + + if (!all && !bookSlug) { + console.error( + "Usage: bun apps/pipeline/src/server/backfill-paragraph-counts.ts [--dry-run] [--limit N]", + ); + console.error( + " or: bun apps/pipeline/src/server/backfill-paragraph-counts.ts --all [--dry-run] [--limit N]", + ); + process.exit(1); + } + + return { bookSlug, all, dryRun, limit: Number.isFinite(limit) ? limit : undefined }; +} + +async function listBookPaths(client: AdminConvexHttpClient): Promise { + const books = await client.query(api.bookQueries.listBooks, {}); + return books.map((b) => b.path); +} + +async function backfillBook( + client: AdminConvexHttpClient, + bookPath: string, + options: { dryRun: boolean; limit?: number }, +): Promise<{ scanned: number; updated: number; skipped: number; failed: number }> { + const chapters = await client.query(api.bookQueries.listHtmlSourceChapters, { bookPath }); + + if (!chapters || chapters.length === 0) { + console.log(`[backfill] ${bookPath}: no chapters-source files found`); + return { scanned: 0, updated: 0, skipped: 0, failed: 0 }; + } + + let scanned = 0; + let updated = 0; + let skipped = 0; + let failed = 0; + + for (const chapter of chapters) { + scanned += 1; + const existingCount = chapter.paragraphCount ?? 
0; + if (existingCount > 0) { + skipped += 1; + continue; + } + + if (options.limit !== undefined && updated >= options.limit) { + break; + } + + const result = await client.action(api.cli.getTextContent, { versionId: chapter.versionId }); + + const html = result?.content ?? ""; + if (!html) { + console.warn(`[backfill] ${bookPath}/${chapter.basename}: empty content`); + failed += 1; + continue; + } + + const paragraphCount = computeParagraphCount(html); + + if (options.dryRun) { + console.log(`[dry-run] ${bookPath}/${chapter.basename} -> paragraphCount=${paragraphCount}`); + updated += 1; + continue; + } + + try { + await client.mutation(api.metadata.updateChapterMetadata, { + bookPath, + folderPath: `${bookPath}/chapters-source`, + basename: chapter.basename, + chapterNumber: chapter.chapterNumber, + paragraphCount, + sourceFormat: chapter.sourceFormat ?? "html", + }); + updated += 1; + console.log( + `[backfill] ${bookPath}/${chapter.basename} set paragraphCount=${paragraphCount}`, + ); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.warn(`[backfill] ${bookPath}/${chapter.basename} failed to update: ${message}`); + failed += 1; + } + } + + return { scanned, updated, skipped, failed }; +} + +async function main() { + const { bookSlug, all, dryRun, limit } = parseArgs(); + const convexUrl = process.env.CONVEX_URL || process.env.NEXT_PUBLIC_CONVEX_URL; + + if (!convexUrl) { + console.error("Missing CONVEX_URL environment variable"); + process.exit(1); + } + + const client = new AdminConvexHttpClient(convexUrl); + const bookPaths = all + ? await listBookPaths(client) + : [`books/${bookSlug?.replace(/^books\//, "")}`]; + + let totalScanned = 0; + let totalUpdated = 0; + let totalSkipped = 0; + let totalFailed = 0; + + let remainingLimit = limit; + + for (const bookPath of bookPaths) { + console.log(`\n[backfill] Processing ${bookPath}...`); + const result = await backfillBook(client, bookPath, { dryRun, limit: remainingLimit }); + totalScanned += result.scanned; + totalUpdated += result.updated; + totalSkipped += result.skipped; + totalFailed += result.failed; + + if (remainingLimit !== undefined) { + remainingLimit -= result.updated; + if (remainingLimit <= 0) { + break; + } + } + } + + console.log( + `\n[backfill] Done. scanned=${totalScanned} updated=${totalUpdated} skipped=${totalSkipped} failed=${totalFailed}`, + ); +} + +main().catch((error) => { + const message = error instanceof Error ? 
error.message : String(error); + console.error(`[backfill] Fatal error: ${message}`); + process.exit(1); +}); diff --git a/apps/pipeline/src/server/clone-book.ts b/apps/pipeline/src/server/clone-book.ts index 34a1e3a1..37e76d0b 100644 --- a/apps/pipeline/src/server/clone-book.ts +++ b/apps/pipeline/src/server/clone-book.ts @@ -18,6 +18,7 @@ import { convex, getCharacterFolders, getChapterXml } from "./convex-client"; import { uploadBookFolder } from "./upload-books-to-r2"; import { AdminConvexHttpClient } from "../lib/AdminConvexHttpClient"; import { api } from "@bookgenius/convex/_generated/api"; +import { computeParagraphCount } from "../lib/paragraphCount"; import "dotenv/config"; import { v4 as uuidv4 } from "uuid"; @@ -182,6 +183,7 @@ async function cloneBook(sourceSlug: string, targetSlug: string): Promise { const repoRoot = path.resolve(__dirname, "../../"); @@ -45,6 +46,7 @@ async function fixChaptersUpload(bookSlug: string): Promise { const chapterNumber = parseInt(match[1], 10); const filePath = path.join(tempOutput, file); const content = await fs.readFile(filePath, "utf-8"); + const paragraphCount = computeParagraphCount(content); const basename = `chapter-${chapterNumber}.html`; console.log(` Uploading chapter ${chapterNumber}...`); @@ -62,6 +64,7 @@ async function fixChaptersUpload(bookSlug: string): Promise { folderPath: `${bookPath}/chapters-source`, basename, chapterNumber, + paragraphCount, sourceFormat: "html", }); diff --git a/apps/pipeline/src/server/pipeline.ts b/apps/pipeline/src/server/pipeline.ts index dfaee5dd..ed1a7cbc 100644 --- a/apps/pipeline/src/server/pipeline.ts +++ b/apps/pipeline/src/server/pipeline.ts @@ -32,6 +32,7 @@ import { } from "../../src/tools/new-tooling/create-graphical-style"; import { getBookSettings } from "../../src/helpers/getBookSettings"; import { generateTagName } from "../../src/helpers/generateTagName"; +import { computeParagraphCount } from "../lib/paragraphCount"; import { initProgress, markStepStarted, @@ -204,6 +205,7 @@ async function uploadChaptersToConvex(job: Job, tempOutputDir: string) { const filePath = path.join(tempOutputDir, file); const content = fs.readFileSync(filePath); const basename = `chapter-${chapterNumber}.html`; + const paragraphCount = computeParagraphCount(content.toString("utf-8")); addLog(job, `Uploading chapter ${chapterNumber} to Convex...`); @@ -220,6 +222,7 @@ async function uploadChaptersToConvex(job: Job, tempOutputDir: string) { basename, chapterNumber, title: `Chapter ${chapterNumber}`, + paragraphCount, sourceFormat: "html", }); addLog(job, `✔ Chapter ${chapterNumber} uploaded`); diff --git a/apps/pipeline/src/server/regenerate-missing-avatars.ts b/apps/pipeline/src/server/regenerate-missing-avatars.ts index 799b2caa..11207add 100644 --- a/apps/pipeline/src/server/regenerate-missing-avatars.ts +++ b/apps/pipeline/src/server/regenerate-missing-avatars.ts @@ -161,6 +161,60 @@ async function generateSingleAvatar( } } +/** + * Generate a generic avatar for unknown/minor characters. + * This avatar matches the book's art style but shows a mysterious silhouette + * that can be used for any speaker not in the character list. 
+ */ +async function generateGenericAvatar(bookPath: string, avatarStyle: string): Promise { + const genericPath = `${bookPath}/characters/generic`; + + // Check if generic avatar already exists + try { + const files = await getPublishedFilesInFolder(genericPath); + const hasAvatar = files.some((f) => f.basename === "avatar-large.png"); + if (hasAvatar) { + console.log("✅ Generic avatar already exists, skipping"); + return; + } + } catch { + // Folder doesn't exist yet, that's fine + } + + console.log("📷 Generating generic avatar for unknown characters..."); + + const genericPrompt = `A mysterious figure shown from behind or in silhouette. +No distinct facial features visible. The figure should feel enigmatic and anonymous, +suitable for representing any unnamed or minor character. +Atmospheric lighting with the figure partially obscured by shadow or mist.`; + + try { + const generator = + process.env.FREE_RUN === "true" + ? generateCharacterImageWithFlux + : generateCharacterImageWithOpenAI; + const imageBuffer = await generator(genericPrompt, "Unknown Character", avatarStyle); + + if (!imageBuffer) { + console.error("❌ Failed to generate generic avatar"); + return; + } + + console.log("📤 Uploading generic avatar..."); + await convex.uploadFile({ + folderPath: genericPath, + basename: "avatar-large.png", + content: imageBuffer, + contentType: "image/png", + }); + + console.log("✅ Successfully generated and uploaded generic avatar"); + } catch (e) { + const errorMsg = e instanceof Error ? e.message : String(e); + console.error("❌ Error generating generic avatar:", errorMsg); + } +} + async function regenerateMissingAvatars(bookPath: string, avatarStyle: string): Promise { const { missingLarge } = await findCharactersMissingAvatars(bookPath); @@ -246,6 +300,9 @@ async function main() { } await regenerateMissingAvatars(bookPath, styleData.avatarStyle); + + // Generate a generic avatar for unknown/minor characters + await generateGenericAvatar(bookPath, styleData.avatarStyle); } main().catch((e) => { diff --git a/apps/pipeline/src/server/upload-chapters.ts b/apps/pipeline/src/server/upload-chapters.ts deleted file mode 100644 index 7e02bd81..00000000 --- a/apps/pipeline/src/server/upload-chapters.ts +++ /dev/null @@ -1,69 +0,0 @@ -import fs from "fs"; -import path from "path"; -import { convex } from "./convex-client"; - -async function uploadChapters(bookSlug: string) { - const repoRoot = path.resolve(__dirname, "../../"); - const bookRoot = path.join(repoRoot, "books-data", bookSlug); - const tempOutputDir = path.join(bookRoot, "temporary-output"); - const bookPath = `books/${bookSlug}`; - - if (!fs.existsSync(tempOutputDir)) { - console.error(`Directory not found: ${tempOutputDir}`); - process.exit(1); - } - - const files = fs - .readdirSync(tempOutputDir) - .filter((f) => f.match(/^rewritten-paragraphs-for-chapter-\d+\.xml$/)); - - if (files.length === 0) { - console.log("No chapter files found to upload"); - return; - } - - console.log(`Found ${files.length} chapters to upload`); - - for (const file of files) { - const match = file.match(/chapter-(\d+)/); - if (!match) continue; - - const chapterNumber = parseInt(match[1], 10); - const filePath = path.join(tempOutputDir, file); - const content = fs.readFileSync(filePath); - const basename = `chapter-${chapterNumber}.html`; - - console.log(`Uploading chapter ${chapterNumber}...`); - - try { - await convex.uploadFile({ - folderPath: `${bookPath}/chapters-source`, - basename, - content, - contentType: "text/html", - }); - await 
convex.updateChapterMetadata({ - bookPath, - folderPath: `${bookPath}/chapters-source`, - basename, - chapterNumber, - title: `Chapter ${chapterNumber}`, - sourceFormat: "html", - }); - console.log(`✔ Chapter ${chapterNumber} uploaded`); - } catch (e) { - const msg = e instanceof Error ? e.message : String(e); - console.error(`✖ Failed to upload chapter ${chapterNumber}: ${msg}`); - } - } - - console.log("Done!"); -} - -const slug = process.argv[2]; -if (!slug) { - console.error("Usage: tsx upload-chapters.ts "); - process.exit(1); -} - -uploadChapters(slug); diff --git a/apps/pipeline/src/tools/NewRewriteParagraphsPromptBook.md b/apps/pipeline/src/tools/NewRewriteParagraphsPromptBook.md index 25e4842c..d9e549e9 100644 --- a/apps/pipeline/src/tools/NewRewriteParagraphsPromptBook.md +++ b/apps/pipeline/src/tools/NewRewriteParagraphsPromptBook.md @@ -34,6 +34,24 @@ Identify mentions of the characters within the text. - **Flexibility:** Match names even if they appear in different grammatical cases (e.g., Polish declensions like "Winstona", "Winstonowi") or possessives (English "Winston's") or when referenced by title ("General") - but only if its a clear reference to the character. - **Structure:** `Mentioned Name` +## 3. Unknown Character Speakers + +When dialogue is spoken by a character **NOT in the Characters List**: + +- **Tag their SPEECH ONLY** - add `data-speaker` attribute to the paragraph +- **DO NOT tag their mentions** - no `data-c` spans for unknown characters +- **Generate a descriptive slug** based on how the text refers to them or their observable traits + +### Slug Guidelines for Unknown Characters: + +- Keep descriptions concise but uniquely identifying (2-5 words) +- Use observable traits: role, appearance, location, action +- Be specific enough to differentiate similar characters (e.g., two soldiers → `tall-soldier-at-gate` vs `wounded-soldier`) + +**Good Examples:** `tall-soldier-at-gate`, `old-woman-selling-bread`, `gruff-innkeeper`, `the-nurse` + +**Bad Examples:** `person` (too generic), `speaker` (not descriptive), `character-1` (meaningless), `soldier` (too generic) + # Constraints (CRITICAL) 1. **Text Invariance:** The visible text inside the tags must remain **EXACTLY** the same as the input. Do not fix grammar, do not correct spelling, do not remove archaic words. @@ -63,8 +81,13 @@ Identify mentions of the characters within the text. **Output HTML:** ```html -

Książę spojrzał na Sarę, a jego wzrok złagodniał.

-

— Panie mój — wyszeptała Sara — twe słowa są jak światło.

+

+ Książę spojrzał na Sarę, a jego + wzrok złagodniał. +

+

+ — Panie mój — wyszeptała Sara — twe słowa są jak światło. +

``` ## Example 2: English (Quotes & Formatting) @@ -92,7 +115,9 @@ Identify mentions of the characters within the text. 'But they were in the well,' Alice said to the Dormouse, ignoring the remark.

-

'Of course they were', said the Dormouse; '—well in.'

+

+ 'Of course they were', said the Dormouse; '—well in.' +

``` ## Example 3: Multiple Speakers (Edge Case) @@ -121,6 +146,30 @@ Identify mentions of the characters within the text.

``` +## Example 4: Unknown Character Speaker + +**Characters (JSON):** + +```json +[{ "id": "winston", "name": "Winston", "desc": "Protagonist" }] +``` + +**Input HTML:** + +```html +

"Stand back!" shouted the tall soldier at the gate.

+

Winston obeyed silently.

+``` + +**Output HTML:** + +```html +

"Stand back!" shouted the tall soldier at the gate.

+

Winston obeyed silently.

+``` + +Note: The soldier gets `data-speaker` with a descriptive slug, but is NOT wrapped in `data-c` because they're not in the Characters List. + --- ## Important reminder diff --git a/apps/pipeline/src/tools/NewRewriteParagraphsPromptBookChunked.md b/apps/pipeline/src/tools/NewRewriteParagraphsPromptBookChunked.md index bea6bbe8..d995eb3e 100644 --- a/apps/pipeline/src/tools/NewRewriteParagraphsPromptBookChunked.md +++ b/apps/pipeline/src/tools/NewRewriteParagraphsPromptBookChunked.md @@ -34,6 +34,24 @@ Identify mentions of the characters within the text. - **Flexibility:** Match names even if they appear in different grammatical cases (e.g., Polish declensions like "Winstona", "Winstonowi") or possessives (English "Winston's") or when referenced by title ("General") - but only if its a clear reference to the character. - **Structure:** `Mentioned Name` +## 3. Unknown Character Speakers + +When dialogue is spoken by a character **NOT in the Characters List**: + +- **Tag their SPEECH ONLY** - add `data-speaker` attribute to the paragraph +- **DO NOT tag their mentions** - no `data-c` spans for unknown characters +- **Generate a descriptive slug** based on how the text refers to them or their observable traits + +### Slug Guidelines for Unknown Characters: + +- Keep descriptions concise but uniquely identifying (2-5 words) +- Use observable traits: role, appearance, location, action +- Be specific enough to differentiate similar characters (e.g., two soldiers → `tall-soldier-at-gate` vs `wounded-soldier`) + +**Good Examples:** `tall-soldier-at-gate`, `old-woman-selling-bread`, `gruff-innkeeper`, `the-nurse` + +**Bad Examples:** `person` (too generic), `speaker` (not descriptive), `character-1` (meaningless), `soldier` (too generic) + # Constraints (CRITICAL) 1. **Text Invariance:** The visible text inside the tags must remain **EXACTLY** the same as the input. Do not fix grammar, do not correct spelling, do not remove archaic words. @@ -63,8 +81,13 @@ Identify mentions of the characters within the text. **Output HTML:** ```html -

Książę spojrzał na Sarę, a jego wzrok złagodniał.

-

— Panie mój — wyszeptała Sara — twe słowa są jak światło.

+

+ Książę spojrzał na Sarę, a jego + wzrok złagodniał. +

+

+ — Panie mój — wyszeptała Sara — twe słowa są jak światło. +

``` ## Example 2: English (Quotes & Formatting) @@ -92,7 +115,9 @@ Identify mentions of the characters within the text. 'But they were in the well,' Alice said to the Dormouse, ignoring the remark.

-

'Of course they were', said the Dormouse; '—well in.'

+

+ 'Of course they were', said the Dormouse; '—well in.' +

``` ## Example 3: Multiple Speakers (Edge Case) @@ -121,6 +146,30 @@ Identify mentions of the characters within the text.

``` +## Example 4: Unknown Character Speaker + +**Characters (JSON):** + +```json +[{ "id": "winston", "name": "Winston", "desc": "Protagonist" }] +``` + +**Input HTML:** + +```html +

"Stand back!" shouted the tall soldier at the gate.

+

Winston obeyed silently.

+``` + +**Output HTML:** + +```html +

"Stand back!" shouted the tall soldier at the gate.

+

Winston obeyed silently.

+``` + +Note: The soldier gets `data-speaker` with a descriptive slug, but is NOT wrapped in `data-c` because they're not in the Characters List. + --- ## Important reminder diff --git a/apps/pipeline/src/tools/chapterChunker.spec.ts b/apps/pipeline/src/tools/chapterChunker.spec.ts new file mode 100644 index 00000000..3ccf6672 --- /dev/null +++ b/apps/pipeline/src/tools/chapterChunker.spec.ts @@ -0,0 +1,42 @@ +import { describe, it, expect } from "vitest"; +import { buildParagraphXml, type Paragraph } from "./chapterChunker"; + +describe("buildParagraphXml", () => { + it("renders attributes and escapes quotes inside attribute values", () => { + const paragraph: Paragraph = { + elementType: "p", + dataIndex: 1, + text: "Hello world", + attributes: { "data-title": 'A "quoted" title' }, + }; + + const output = buildParagraphXml(paragraph); + + expect(output).toBe('

Hello world

'); + }); + + it("preserves inner HTML tags in the paragraph text", () => { + const paragraph: Paragraph = { + elementType: "p", + dataIndex: 2, + text: 'Hello Mary.', + }; + + const output = buildParagraphXml(paragraph); + + expect(output).toContain("Mary"); + }); + + it("keeps double-quoted attributes inside embedded HTML", () => { + const paragraph: Paragraph = { + elementType: "figure", + dataIndex: 3, + text: 'Mrs. Inglethorp\'s bedroom', + }; + + const output = buildParagraphXml(paragraph); + + expect(output).toContain('alt="Mrs. Inglethorp\'s bedroom"'); + }); +}); diff --git a/apps/pipeline/src/tools/chapterChunker.ts b/apps/pipeline/src/tools/chapterChunker.ts index bd4fadd6..b6fda21b 100644 --- a/apps/pipeline/src/tools/chapterChunker.ts +++ b/apps/pipeline/src/tools/chapterChunker.ts @@ -111,7 +111,7 @@ function buildAttributeString(attributes?: Record): string { export function buildParagraphXml(p: Paragraph): string { const attrs = buildAttributeString(p.attributes); - return `<${p.elementType}${attrs}>${p.text.trim().replace(/"/g, "'")}`; + return `<${p.elementType}${attrs}>${p.text.trim()}`; } export function buildChunkXml(paragraphs: Paragraph[]): string { diff --git a/apps/pipeline/src/tools/fix-legacy-play-custom-tags.spec.ts b/apps/pipeline/src/tools/fix-legacy-play-custom-tags.spec.ts new file mode 100644 index 00000000..460ef7ea --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-custom-tags.spec.ts @@ -0,0 +1,70 @@ +import { describe, expect, it } from "vitest"; +import { JSDOM } from "jsdom"; +import { fixLegacyPlayCustomTags } from "./fix-legacy-play-custom-tags"; + +function parseSection(html: string): Element { + const dom = new JSDOM(html); + const doc = dom.window.document; + const section = doc.querySelector("section[data-chapter]"); + if (!section) { + throw new Error("Missing section[data-chapter] in test input"); + } + return section; +} + +describe("fixLegacyPlayCustomTags", () => { + it("converts non-HTML tags outside em to spans with data-c", () => { + const input = ` +
+

HAMLET.

+

O dear Ophelia, I am ill at these numbers.

+
+ `; + + const result = fixLegacyPlayCustomTags(input); + const section = parseSection(result); + + const hamlet = section.querySelector('span[data-c="hamlet"]'); + const ophelia = section.querySelector('span[data-c="ophelia"]'); + + expect(hamlet?.textContent).toBe("HAMLET"); + expect(ophelia?.textContent).toBe("Ophelia"); + + expect(section.querySelector("hamlet")).toBeNull(); + expect(section.querySelector("ophelia")).toBeNull(); + }); + + it("preserves custom attributes and maps enters/exits to data-* while dropping raw attrs", () => { + const input = ` +
+

House of Capulet.

+
+ `; + + const result = fixLegacyPlayCustomTags(input); + const section = parseSection(result); + + const capulet = section.querySelector( + 'span[data-c="capulet"][data-enters="true"][dynasty="true"]', + ); + + expect(capulet?.textContent).toBe("Capulet"); + expect(capulet?.hasAttribute("enters")).toBe(false); + expect(capulet?.hasAttribute("exits")).toBe(false); + }); + + it("does not convert valid HTML tags like cite", () => { + const input = ` +
+

Source: Some Book.

+
+ `; + + const result = fixLegacyPlayCustomTags(input); + const section = parseSection(result); + const cite = section.querySelector("cite"); + + expect(cite?.textContent).toBe("Some Book"); + expect(section.querySelector('span[data-c="cite"]')).toBeNull(); + }); +}); diff --git a/apps/pipeline/src/tools/fix-legacy-play-custom-tags.ts b/apps/pipeline/src/tools/fix-legacy-play-custom-tags.ts new file mode 100644 index 00000000..f1bdef9b --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-custom-tags.ts @@ -0,0 +1,168 @@ +import { JSDOM } from "jsdom"; + +const HTML_TAGS = new Set([ + "a", + "abbr", + "address", + "article", + "aside", + "audio", + "b", + "bdi", + "bdo", + "blockquote", + "body", + "br", + "button", + "canvas", + "caption", + "cite", + "code", + "col", + "colgroup", + "data", + "datalist", + "dd", + "del", + "details", + "dfn", + "dialog", + "div", + "dl", + "dt", + "em", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "header", + "hr", + "html", + "i", + "img", + "input", + "ins", + "kbd", + "label", + "legend", + "li", + "link", + "main", + "map", + "mark", + "meta", + "meter", + "nav", + "noscript", + "object", + "ol", + "optgroup", + "option", + "output", + "p", + "param", + "picture", + "pre", + "progress", + "q", + "rp", + "rt", + "ruby", + "s", + "samp", + "script", + "section", + "select", + "small", + "source", + "span", + "strong", + "style", + "sub", + "summary", + "sup", + "table", + "tbody", + "td", + "template", + "textarea", + "tfoot", + "th", + "thead", + "time", + "title", + "tr", + "track", + "u", + "ul", + "var", + "video", + "wbr", +]); + +function slugifyTag(tagName: string): string { + return tagName + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, ""); +} + +function copyAttributes(target: Element, source: Element, skip: Set): void { + for (const attr of Array.from(source.attributes)) { + const name = attr.name.toLowerCase(); + if (skip.has(name)) continue; + target.setAttribute(attr.name, attr.value); + } +} + +export function fixLegacyPlayCustomTags(html: string): string { + const dom = new JSDOM(html); + const doc = dom.window.document; + let didFix = false; + + const allElements = Array.from(doc.querySelectorAll("*")); + for (const element of allElements) { + const tagName = element.tagName.toLowerCase(); + if (HTML_TAGS.has(tagName)) continue; + + const slug = slugifyTag(tagName); + if (!slug) continue; + + const span = doc.createElement("span"); + span.setAttribute("data-c", slug); + + const entersValue = element.getAttribute("enters") ?? element.getAttribute("data-enters"); + const exitsValue = element.getAttribute("exits") ?? element.getAttribute("data-exits"); + + copyAttributes( + span, + element, + new Set(["enters", "exits", "data-enters", "data-exits", "talking"]), + ); + + if (entersValue !== null) { + span.setAttribute("data-enters", entersValue || "true"); + } + if (exitsValue !== null) { + span.setAttribute("data-exits", exitsValue || "true"); + } + + const text = (element.textContent || "").trim(); + if (text.length > 0) { + span.textContent = text; + } + + element.replaceWith(span); + didFix = true; + } + + return didFix ? 
doc.body.innerHTML : html; +} diff --git a/apps/pipeline/src/tools/fix-legacy-play-didaskalia.spec.ts b/apps/pipeline/src/tools/fix-legacy-play-didaskalia.spec.ts new file mode 100644 index 00000000..b3c3b24d --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-didaskalia.spec.ts @@ -0,0 +1,94 @@ +import { describe, expect, it } from "vitest"; +import { JSDOM } from "jsdom"; +import { fixLegacyPlayDidaskalia } from "./fix-legacy-play-didaskalia"; + +function parseSection(html: string): Element { + const dom = new JSDOM(html); + const doc = dom.window.document; + const section = doc.querySelector("section[data-chapter]"); + if (!section) { + throw new Error("Missing section[data-chapter] in test input"); + } + return section; +} + +describe("fixLegacyPlayDidaskalia", () => { + it("moves didaskalia and narration into current speaker until next speaker block", () => { + const input = ` +
+
+

So well thy words become thee as thy wounds;

+

They smack of honour both. Go get him surgeons.

+
+

Exit Sergeant, attended

+

Who comes here?

+

Enter ROSS

+
+

The worthy thane of Ross.

+
+
+ `; + + const result = fixLegacyPlayDidaskalia(input); + const section = parseSection(result); + const duncan = section.querySelector('div[data-speaker="duncan"]'); + expect(duncan).toBeTruthy(); + const duncanPs = duncan?.querySelectorAll("p") ?? []; + expect(duncanPs.length).toBe(4); + expect(duncanPs[2]?.textContent).toContain("Exit Sergeant"); + expect(duncanPs[3]?.textContent).toContain("Who comes here?"); + + const duncanNext = duncan?.nextElementSibling; + expect(duncanNext?.tagName.toLowerCase()).toBe("p"); + expect(duncanNext?.textContent).toContain("Enter ROSS"); + }); + + it("keeps didaskalia outside when it precedes the next speaker block", () => { + const input = ` +
+
+

So well thy words become thee as thy wounds;

+

They smack of honour both. Go get him surgeons.

+
+

Enter ROSS

+
+

The worthy thane of Ross.

+
+
+ `; + + const result = fixLegacyPlayDidaskalia(input); + const section = parseSection(result); + const duncan = section.querySelector('div[data-speaker="duncan"]'); + expect(duncan).toBeTruthy(); + const duncanPs = duncan?.querySelectorAll("p") ?? []; + expect(duncanPs.length).toBe(2); + + const duncanNext = duncan?.nextElementSibling; + expect(duncanNext?.tagName.toLowerCase()).toBe("p"); + expect(duncanNext?.textContent).toContain("Enter ROSS"); + }); + + it("keeps trailing didaskalia outside when it is the last element", () => { + const input = ` +
+
+

How goes the night, boy?

+
+

Exit

+
+ `; + + const result = fixLegacyPlayDidaskalia(input); + const section = parseSection(result); + const macbeth = section.querySelector('div[data-speaker="macbeth"]'); + expect(macbeth).toBeTruthy(); + const macbethPs = macbeth?.querySelectorAll("p") ?? []; + expect(macbethPs.length).toBe(1); + + const next = macbeth?.nextElementSibling; + expect(next?.tagName.toLowerCase()).toBe("p"); + expect(next?.getAttribute("data-is-didaskalia")).toBe("true"); + expect(next?.textContent).toContain("Exit"); + }); +}); diff --git a/apps/pipeline/src/tools/fix-legacy-play-didaskalia.ts b/apps/pipeline/src/tools/fix-legacy-play-didaskalia.ts new file mode 100644 index 00000000..f64c7a67 --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-didaskalia.ts @@ -0,0 +1,120 @@ +import { JSDOM } from "jsdom"; + +const BOUNDARY_TAGS = new Set(["h1", "h2", "h3", "h4", "h5", "h6"]); + +function isSpeakerBlock(element: Element): boolean { + if (element.tagName.toLowerCase() !== "div") return false; + if (!element.hasAttribute("data-speaker")) return false; + return ( + element.hasAttribute("data-label") || element.querySelector("[data-speaker-label]") !== null + ); +} + +function hasNonWhitespaceText(nodes: ChildNode[]): boolean { + return nodes.some((node) => node.nodeType === 3 && node.textContent?.trim()); +} + +function onlyElementChild(element: Element): Element | null { + const children = Array.from(element.children); + if (children.length !== 1) return null; + return children[0]; +} + +function isPureEmParagraph(p: Element): boolean { + if (hasNonWhitespaceText(Array.from(p.childNodes))) return false; + + const directChild = onlyElementChild(p); + if (!directChild) return false; + if (directChild.tagName.toLowerCase() === "em") return true; + + if (directChild.tagName.toLowerCase() !== "span") return false; + if (hasNonWhitespaceText(Array.from(directChild.childNodes))) return false; + + const spanChild = onlyElementChild(directChild); + return spanChild?.tagName.toLowerCase() === "em"; +} + +function isDidaskaliaParagraph(element: Element): boolean { + if (element.tagName.toLowerCase() !== "p") return false; + if (element.getAttribute("data-is-didaskalia") === "true") return true; + return isPureEmParagraph(element); +} + +function isBoundaryElement(element: Element): boolean { + const tagName = element.tagName.toLowerCase(); + return BOUNDARY_TAGS.has(tagName) || tagName === "section"; +} + +function findNextNonDidaskaliaSibling(start: Element): Element | null { + let next = start.nextElementSibling; + while (next && isDidaskaliaParagraph(next)) { + next = next.nextElementSibling; + } + return next; +} + +function isMovableParagraph(element: Element): boolean { + return element.tagName.toLowerCase() === "p" && !element.hasAttribute("data-speaker"); +} + +function fixSection(section: Element): boolean { + let didFix = false; + let node: Element | null = section.firstElementChild; + + while (node) { + if (isSpeakerBlock(node)) { + let cursor = node.nextElementSibling; + + while (cursor) { + if (isSpeakerBlock(cursor) || isBoundaryElement(cursor)) { + break; + } + + if (isDidaskaliaParagraph(cursor)) { + const nextNonDidaskalia = findNextNonDidaskaliaSibling(cursor); + if (!nextNonDidaskalia || isSpeakerBlock(nextNonDidaskalia)) { + break; + } + const toMove = cursor; + cursor = cursor.nextElementSibling; + node.appendChild(toMove); + didFix = true; + continue; + } + + if (isMovableParagraph(cursor)) { + const toMove = cursor; + cursor = cursor.nextElementSibling; + node.appendChild(toMove); + didFix = 
true; + continue; + } + + break; + } + } + + node = node.nextElementSibling; + } + + return didFix; +} + +export function fixLegacyPlayDidaskalia(html: string): string { + const dom = new JSDOM(html); + const doc = dom.window.document; + const sections = Array.from(doc.querySelectorAll("section[data-chapter]")); + const containers = sections.length ? sections : [doc.body]; + + let didFix = false; + for (const section of containers) { + if (!section.querySelector("div[data-speaker][data-label]")) { + continue; + } + if (fixSection(section)) { + didFix = true; + } + } + + return didFix ? doc.body.innerHTML : html; +} diff --git a/apps/pipeline/src/tools/fix-legacy-play-multi-speaker.spec.ts b/apps/pipeline/src/tools/fix-legacy-play-multi-speaker.spec.ts new file mode 100644 index 00000000..c519e4c6 --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-multi-speaker.spec.ts @@ -0,0 +1,39 @@ +import { describe, expect, it } from "vitest"; +import { + applyMultiSpeakerMapToHtml, + extractMultiSpeakerNextLineMapFromXml, +} from "./fix-legacy-play-multi-speaker"; + +describe("extractMultiSpeakerNextLineMapFromXml", () => { + it("maps the next line id to multiple speaker slugs", () => { + const xml = ` + +

ALL

+

Fair is foul, and foul is fair:

+
+ `; + + const map = extractMultiSpeakerNextLineMapFromXml(xml); + expect(map.get("ch1-p22-s1")).toEqual(["first-witch", "second-witch", "third-witch"]); + }); +}); + +describe("applyMultiSpeakerMapToHtml", () => { + it("updates data-speaker based on the next line id", () => { + const html = ` +
+
+

Fair is foul, and foul is fair:

+

Hover through the fog and filthy air.

+
+
+ `; + + const map = new Map([ + ["ch1-p22-s1", ["first-witch", "second-witch", "third-witch"]], + ]); + + const result = applyMultiSpeakerMapToHtml(html, map); + expect(result).toContain('data-speaker="first-witch second-witch third-witch"'); + }); +}); diff --git a/apps/pipeline/src/tools/fix-legacy-play-multi-speaker.ts b/apps/pipeline/src/tools/fix-legacy-play-multi-speaker.ts new file mode 100644 index 00000000..2284c853 --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-multi-speaker.ts @@ -0,0 +1,85 @@ +import { DOMParser, type Element as XMLElement } from "@xmldom/xmldom"; +import { JSDOM } from "jsdom"; + +function slugifyTagName(tagName: string): string { + return tagName + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, ""); +} + +function findFirstId(element: XMLElement): string | null { + const walker = element.getElementsByTagName("*"); + for (let i = 0; i < walker.length; i += 1) { + const node = walker.item(i) as XMLElement | null; + const id = node?.getAttribute("id"); + if (id) return id; + } + return null; +} + +function getTalkingTags(p: XMLElement): XMLElement[] { + const all = p.getElementsByTagName("*"); + const result: XMLElement[] = []; + for (let i = 0; i < all.length; i += 1) { + const node = all.item(i) as XMLElement | null; + if (node && node.getAttribute("talking") === "true") { + result.push(node); + } + } + return result; +} + +export function extractMultiSpeakerNextLineMapFromXml(xml: string): Map { + const parser = new DOMParser(); + const doc = parser.parseFromString(xml, "text/html"); + const paragraphs = Array.from(doc.getElementsByTagName("p")) as XMLElement[]; + const map = new Map(); + + for (let i = 0; i < paragraphs.length; i += 1) { + const p = paragraphs[i]; + const talking = getTalkingTags(p); + if (talking.length < 2) continue; + if (p.getElementsByTagName("strong").length === 0) continue; + + const speakers = Array.from( + new Set(talking.map((node) => slugifyTagName(node.tagName))), + ).filter(Boolean); + if (speakers.length < 2) continue; + + let nextId: string | null = null; + for (let j = i + 1; j < paragraphs.length; j += 1) { + nextId = findFirstId(paragraphs[j]); + if (nextId) break; + } + + if (nextId) { + map.set(nextId, speakers); + } + } + + return map; +} + +export function applyMultiSpeakerMapToHtml(html: string, map: Map): string { + if (map.size === 0) return html; + + const dom = new JSDOM(html); + const doc = dom.window.document; + const speakerBlocks = Array.from(doc.querySelectorAll("div[data-speaker][data-label]")); + let didFix = false; + + for (const block of speakerBlocks) { + const firstId = block.querySelector("[id]")?.getAttribute("id") ?? null; + if (!firstId) continue; + const speakers = map.get(firstId); + if (!speakers || speakers.length < 2) continue; + const joined = speakers.join(" "); + if (block.getAttribute("data-speaker") !== joined) { + block.setAttribute("data-speaker", joined); + didFix = true; + } + } + + return didFix ? 
doc.body.innerHTML : html; +} diff --git a/apps/pipeline/src/tools/fix-legacy-play-stage-directions.spec.ts b/apps/pipeline/src/tools/fix-legacy-play-stage-directions.spec.ts new file mode 100644 index 00000000..40cea2ac --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-stage-directions.spec.ts @@ -0,0 +1,77 @@ +import { describe, expect, it } from "vitest"; +import { JSDOM } from "jsdom"; +import { fixLegacyPlayStageDirections } from "./fix-legacy-play-stage-directions"; + +function parseSection(html: string): Element { + const dom = new JSDOM(html); + const doc = dom.window.document; + const section = doc.querySelector("section[data-chapter]"); + if (!section) { + throw new Error("Missing section[data-chapter] in test input"); + } + return section; +} + +describe("fixLegacyPlayStageDirections", () => { + it("converts legacy character tags in stage directions to spans with data-c and data-enters", () => { + const input = ` +
+

+ Enter BENVOLIO and MERCUTIO +

+
+ `; + + const result = fixLegacyPlayStageDirections(input); + const section = parseSection(result); + + const benvolio = section.querySelector('span[data-c="benvolio"][data-enters="true"]'); + const mercutio = section.querySelector('span[data-c="mercutio"][data-enters="true"]'); + + expect(benvolio?.textContent).toBe("BENVOLIO"); + expect(mercutio?.textContent).toBe("MERCUTIO"); + + expect(section.querySelector("benvolio")).toBeNull(); + expect(section.querySelector("mercutio")).toBeNull(); + }); + + it("converts legacy exit tags to spans with data-exits", () => { + const input = ` +
+

+ Exeunt all but BENVOLIOSAMPSON +

+
+ `; + + const result = fixLegacyPlayStageDirections(input); + const section = parseSection(result); + + const benvolio = section.querySelector('span[data-c="benvolio"]'); + const sampson = section.querySelector('span[data-c="sampson"][data-exits="true"]'); + + expect(benvolio?.textContent).toBe("BENVOLIO"); + expect(sampson?.textContent).toBe("SAMPSON"); + }); + + it("preserves custom attributes and drops raw enters/exits attributes", () => { + const input = ` +
+

+ Enter CAPULET +

+
+ `; + + const result = fixLegacyPlayStageDirections(input); + const section = parseSection(result); + + const capulet = section.querySelector( + 'span[data-c="capulet"][data-enters="true"][dynasty="true"]', + ); + + expect(capulet?.textContent).toBe("CAPULET"); + expect(capulet?.hasAttribute("enters")).toBe(false); + expect(capulet?.hasAttribute("exits")).toBe(false); + }); +}); diff --git a/apps/pipeline/src/tools/fix-legacy-play-stage-directions.ts b/apps/pipeline/src/tools/fix-legacy-play-stage-directions.ts new file mode 100644 index 00000000..e0477c3a --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-stage-directions.ts @@ -0,0 +1,113 @@ +import { JSDOM } from "jsdom"; + +const KNOWN_HTML_TAGS = new Set([ + "a", + "article", + "aside", + "b", + "blockquote", + "br", + "button", + "caption", + "code", + "div", + "em", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "hr", + "i", + "img", + "li", + "main", + "ol", + "p", + "section", + "small", + "span", + "strong", + "sub", + "sup", + "table", + "tbody", + "td", + "th", + "thead", + "tr", + "u", + "ul", +]); + +function slugifyTag(tagName: string): string { + return tagName + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, ""); +} + +function isConvertibleStageDirectionNode(element: Element): boolean { + const tagName = element.tagName.toLowerCase(); + if (KNOWN_HTML_TAGS.has(tagName)) return false; + if (!element.closest("em")) return false; + return true; +} + +function getAttrValue(element: Element, name: string): string | null { + if (element.hasAttribute(name)) return element.getAttribute(name); + return null; +} + +function copyAttributes(target: Element, source: Element, skip: Set): void { + for (const attr of Array.from(source.attributes)) { + const name = attr.name.toLowerCase(); + if (skip.has(name)) continue; + target.setAttribute(attr.name, attr.value); + } +} + +export function fixLegacyPlayStageDirections(html: string): string { + const dom = new JSDOM(html); + const doc = dom.window.document; + let didFix = false; + + const allElements = Array.from(doc.querySelectorAll("*")); + for (const element of allElements) { + if (!isConvertibleStageDirectionNode(element)) continue; + + const tagName = element.tagName.toLowerCase(); + const slug = slugifyTag(tagName); + if (!slug) continue; + + const span = doc.createElement("span"); + span.setAttribute("data-c", slug); + + const entersValue = getAttrValue(element, "enters") ?? getAttrValue(element, "data-enters"); + const exitsValue = getAttrValue(element, "exits") ?? getAttrValue(element, "data-exits"); + + copyAttributes( + span, + element, + new Set(["enters", "exits", "data-enters", "data-exits", "talking"]), + ); + + if (entersValue !== null) { + span.setAttribute("data-enters", entersValue || "true"); + } + if (exitsValue !== null) { + span.setAttribute("data-exits", exitsValue || "true"); + } + + const text = (element.textContent || "").trim(); + if (text.length > 0) { + span.textContent = text; + } + + element.replaceWith(span); + didFix = true; + } + + return didFix ? 
doc.body.innerHTML : html; +} diff --git a/apps/pipeline/src/tools/fix-non-play-custom-tags.spec.ts b/apps/pipeline/src/tools/fix-non-play-custom-tags.spec.ts new file mode 100644 index 00000000..a4a947bd --- /dev/null +++ b/apps/pipeline/src/tools/fix-non-play-custom-tags.spec.ts @@ -0,0 +1,119 @@ +import { describe, expect, it } from "vitest"; +import { JSDOM } from "jsdom"; +import { fixNonPlayCustomTags } from "./fix-non-play-custom-tags"; + +function parseSection(html: string): Element { + const dom = new JSDOM(html); + const doc = dom.window.document; + const section = doc.querySelector("section[data-chapter]"); + if (!section) { + throw new Error("Missing section[data-chapter] in test input"); + } + return section; +} + +describe("fixNonPlayCustomTags", () => { + it("converts note tags to link-note anchors", () => { + const input = ` +
+

Text beforeafter.

+
+ `; + + const result = fixNonPlayCustomTags(input); + const section = parseSection(result); + + const note = section.querySelector('a.link-note[data-note="2"]'); + expect(note?.textContent).toBe("2"); + expect(section.querySelector("note")).toBeNull(); + }); + + it("converts self-closing note tags to link-note anchors", () => { + const input = ` +
+

See here.

+
+ `; + + const result = fixNonPlayCustomTags(input); + const section = parseSection(result); + + const note = section.querySelector('a.link-note[data-note="448"]'); + expect(note?.textContent).toBe("448"); + expect(section.querySelector("note")).toBeNull(); + }); + + it("converts inline custom tags to spans with data-c", () => { + const input = ` +
+

Hyades, Hastur, and Aldebaran.

+
+ `; + + const result = fixNonPlayCustomTags(input); + const section = parseSection(result); + + const hastur = section.querySelector('span[data-c="hastur"]'); + expect(hastur?.textContent).toBe("Hastur"); + expect(section.querySelector("hastur")).toBeNull(); + }); + + it("promotes empty custom tags at start of a paragraph to data-speaker", () => { + const input = ` +
+

— Stój! Pal!

+
+ `; + + const result = fixNonPlayCustomTags(input); + const section = parseSection(result); + const paragraph = section.querySelector('p[data-index="123"]'); + + expect(paragraph?.getAttribute("data-speaker")).toBe("stoj-pal"); + expect(section.querySelector("stoj-pal")).toBeNull(); + expect(paragraph?.textContent?.trim().startsWith("— Stój! Pal!")).toBe(true); + }); + + it("treats self-closing talking tags at start as speakers", () => { + const input = ` +
+

'How doth the little crocodile

+
+ `; + + const result = fixNonPlayCustomTags(input); + const section = parseSection(result); + const paragraph = section.querySelector("p.verse"); + + expect(paragraph?.getAttribute("data-speaker")).toBe("alice"); + expect(section.querySelector("alice")).toBeNull(); + }); + + it("leaves hgroup tags intact", () => { + const input = ` +
+

Title

+
+ `; + + const result = fixNonPlayCustomTags(input); + const section = parseSection(result); + expect(section.querySelector("hgroup")).not.toBeNull(); + }); + + it("handles invalid see tags by preserving the reference", () => { + const input = ` +
+

+
+ `; + + const result = fixNonPlayCustomTags(input); + const section = parseSection(result); + + const see = section.querySelector('span[data-see="05.05.Sketch.gif"]'); + expect(see).not.toBeNull(); + expect(section.querySelector('span[data-c="see"]')).toBeNull(); + expect(section.querySelector("see")).toBeNull(); + }); +}); diff --git a/apps/pipeline/src/tools/fix-non-play-custom-tags.ts b/apps/pipeline/src/tools/fix-non-play-custom-tags.ts new file mode 100644 index 00000000..324be81e --- /dev/null +++ b/apps/pipeline/src/tools/fix-non-play-custom-tags.ts @@ -0,0 +1,268 @@ +import { JSDOM } from "jsdom"; + +const HTML_TAGS = new Set([ + "a", + "abbr", + "address", + "article", + "aside", + "audio", + "b", + "bdi", + "bdo", + "blockquote", + "body", + "br", + "button", + "canvas", + "caption", + "cite", + "code", + "col", + "colgroup", + "data", + "datalist", + "dd", + "del", + "details", + "dfn", + "dialog", + "div", + "dl", + "dt", + "em", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "header", + "hr", + "html", + "i", + "img", + "input", + "ins", + "kbd", + "label", + "legend", + "li", + "link", + "main", + "map", + "mark", + "meta", + "meter", + "nav", + "noscript", + "object", + "ol", + "optgroup", + "option", + "output", + "p", + "param", + "picture", + "pre", + "progress", + "q", + "rp", + "rt", + "ruby", + "s", + "samp", + "script", + "section", + "select", + "small", + "source", + "span", + "strong", + "style", + "sub", + "summary", + "sup", + "table", + "tbody", + "td", + "template", + "textarea", + "tfoot", + "th", + "thead", + "time", + "title", + "tr", + "track", + "u", + "ul", + "var", + "video", + "wbr", +]); + +const IGNORED_CUSTOM_TAGS = new Set(["hgroup"]); +const NON_SPEAKER_TAGS = new Set(["note", "see"]); + +function slugifyTag(tagName: string): string { + return tagName + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, ""); +} + +function copyAttributes(target: Element, source: Element, skip: Set): void { + for (const attr of Array.from(source.attributes)) { + const name = attr.name.toLowerCase(); + if (skip.has(name)) continue; + target.setAttribute(attr.name, attr.value); + } +} + +function normalizeInvalidSeeTags(html: string): string { + return html.replace(/]+)>/gi, (match, rawValue) => { + const value = String(rawValue).trim(); + if (!value || value.includes("=")) { + return match; + } + const escaped = value.replace(/"/g, """); + return ``; + }); +} + +function isCustomTag(tagName: string): boolean { + const lower = tagName.toLowerCase(); + if (HTML_TAGS.has(lower)) return false; + if (IGNORED_CUSTOM_TAGS.has(lower)) return false; + return true; +} + +function findFirstSignificantElement(p: Element): Element | null { + for (const node of Array.from(p.childNodes)) { + if (node.nodeType === 3) { + if (node.textContent?.trim()) return null; + continue; + } + if (node.nodeType === 1) return node as Element; + } + return null; +} + +function isEmptyElement(element: Element): boolean { + if (element.children.length > 0) return false; + return !(element.textContent ?? 
"").trim(); +} + +function unwrapElement(element: Element): void { + const parent = element.parentNode; + if (!parent) return; + while (element.firstChild) { + parent.insertBefore(element.firstChild, element); + } + parent.removeChild(element); +} + +function convertNotes(doc: Document): boolean { + let didFix = false; + const notes = Array.from(doc.querySelectorAll("note")); + for (const note of notes) { + const id = note.getAttribute("id") ?? ""; + const anchor = doc.createElement("a"); + anchor.className = "link-note"; + if (id) { + anchor.setAttribute("data-note", id); + anchor.textContent = id; + } + note.replaceWith(anchor); + didFix = true; + } + return didFix; +} + +function promoteSpeakerTags(doc: Document): boolean { + let didFix = false; + const paragraphs = Array.from(doc.querySelectorAll("p")); + for (const p of paragraphs) { + const firstElement = findFirstSignificantElement(p); + if (!firstElement) continue; + const tagName = firstElement.tagName.toLowerCase(); + if (!isCustomTag(tagName)) continue; + if (NON_SPEAKER_TAGS.has(tagName)) continue; + const hasTalking = firstElement.getAttribute("talking") === "true"; + if (!hasTalking && !isEmptyElement(firstElement)) continue; + + const slug = slugifyTag(tagName); + if (!slug) continue; + if (!p.hasAttribute("data-speaker")) { + p.setAttribute("data-speaker", slug); + } + if (firstElement.childNodes.length > 0) { + unwrapElement(firstElement); + } else { + firstElement.remove(); + } + didFix = true; + } + return didFix; +} + +function convertInlineCustomTags(doc: Document): boolean { + let didFix = false; + const elements = Array.from(doc.querySelectorAll("*")); + for (const element of elements) { + const tagName = element.tagName.toLowerCase(); + if (!isCustomTag(tagName)) continue; + if (tagName === "note") continue; + if (IGNORED_CUSTOM_TAGS.has(tagName)) continue; + + if (tagName === "see") { + const span = doc.createElement("span"); + copyAttributes(span, element, new Set(["talking"])); + const text = (element.textContent || "").trim(); + if (text.length > 0) { + span.textContent = text; + } + element.replaceWith(span); + didFix = true; + continue; + } + + const slug = slugifyTag(tagName); + if (!slug) continue; + + const span = doc.createElement("span"); + span.setAttribute("data-c", slug); + copyAttributes(span, element, new Set(["talking"])); + + const text = (element.textContent || "").trim(); + if (text.length > 0) { + span.textContent = text; + } + + element.replaceWith(span); + didFix = true; + } + return didFix; +} + +export function fixNonPlayCustomTags(html: string): string { + const normalized = normalizeInvalidSeeTags(html); + const dom = new JSDOM(normalized); + const doc = dom.window.document; + + const didFixNotes = convertNotes(doc); + const didFixSpeakers = promoteSpeakerTags(doc); + const didFixInline = convertInlineCustomTags(doc); + + if (didFixNotes || didFixSpeakers || didFixInline || normalized !== html) { + return doc.body.innerHTML; + } + return html; +} diff --git a/apps/pipeline/src/tools/getParagraphsFromChapterWithText.spec.ts b/apps/pipeline/src/tools/getParagraphsFromChapterWithText.spec.ts new file mode 100644 index 00000000..21581639 --- /dev/null +++ b/apps/pipeline/src/tools/getParagraphsFromChapterWithText.spec.ts @@ -0,0 +1,34 @@ +import { describe, it, expect } from "vitest"; +import { getParagraphsFromChapterWithText } from "./getParagraphsFromChapterWithText"; + +describe("getParagraphsFromChapterWithText", () => { + it("preserves curly double quotes inside attribute values", () => 
{ + const bookText = ` +
+
+ A note with “ll” and “and” visible. +
+
+ `; + + const paragraphs = getParagraphsFromChapterWithText(1, bookText); + const html = paragraphs[0]?.text ?? ""; + + expect(html).toContain("“ll”"); + expect(html).toContain("“and”"); + expect(html).not.toContain('"ll"'); + }); + + it("preserves curly double quotes in text content", () => { + const bookText = ` +
+

He said “hello” and left.

+
+ `; + + const paragraphs = getParagraphsFromChapterWithText(1, bookText); + const html = paragraphs[0]?.text ?? ""; + + expect(html).toContain("“hello”"); + }); +}); diff --git a/apps/pipeline/src/tools/getParagraphsFromChapterWithText.ts b/apps/pipeline/src/tools/getParagraphsFromChapterWithText.ts index f475ba3b..5ed2a66b 100644 --- a/apps/pipeline/src/tools/getParagraphsFromChapterWithText.ts +++ b/apps/pipeline/src/tools/getParagraphsFromChapterWithText.ts @@ -46,8 +46,6 @@ export const getParagraphsFromChapterWithText = ( .filter((element) => element?.text.length > 0) .map((pageText, index) => { const text = pageText.text - .replace(/\u201c/g, '"') - .replace(/\u201d/g, '"') .replace(/\u2019/g, "'") .replace(/\u2018/g, "'") .replace(/\u2013/g, "-") diff --git a/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts b/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts index c5afb17f..072f3457 100644 --- a/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts +++ b/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts @@ -7,6 +7,7 @@ import { logger } from "../logger"; import fs from "fs"; import { compareXmlTextContent } from "./new-tooling/compare-chapters-xml"; import { restoreOriginalTextInHtml } from "./new-tooling/restore-text-in-html"; +import { restoreUnwrappedBlocks } from "./new-tooling/restore-unwrapped-blocks"; import path from "path"; import { type NewReferenceCardsResponse } from "../types"; import { writeBookFile } from "../helpers/writeBookFile"; @@ -148,6 +149,15 @@ async function processChunk( logger.error(`Error restoring original text for chapter ${chapter} chunk ${chunkIndex}`, e); } + try { + restored = restoreUnwrappedBlocks(originalChunkXml, restored); + } catch (e) { + logger.error( + `Error restoring unwrapped blocks for chapter ${chapter} chunk ${chunkIndex}`, + e, + ); + } + if (restored && compareXmlTextContent(originalChunkXml, restored)) { logger.info(`✅ Chunk ${chunkIndex} validated for chapter ${chapter}`); writeBookFile(`${chunkFileName.replace(".xml", "")}-${selectedProvider.name}.xml`, restored); @@ -271,10 +281,8 @@ export const identifyAndRewriteParagraphs = async ( writeBookFile(`compiled-prompt-for-chapter-${chapter}-gemini2.md`, compiledPrompt); - // const llmProviders = [callGeminiWrapper, callClaude]; - const llmProviders = [callGeminiWrapper, callGrok, callClaude, callGpt5]; - // const llmProviders = [callGeminiWrapper]; + try { const selectedProvider = llmProviders[attempt % llmProviders.length]; logger.info("Using provider: " + selectedProvider.name); @@ -293,6 +301,12 @@ export const identifyAndRewriteParagraphs = async ( logger.error("Error restoring original text for chapter " + chapter, e); } + try { + restored = restoreUnwrappedBlocks(paragraphsForPage, restored); + } catch (e) { + logger.error("Error restoring unwrapped blocks for chapter " + chapter, e); + } + if (restored && compareXmlTextContent(paragraphsForPage, restored)) { // Build section attributes string, including format and any preserved epub-type const formatAttr = chapterFormat !== "prose" ? 
` data-chapter-format="${chapterFormat}"` : ""; @@ -337,10 +351,9 @@ export const identifyCharactersAndRewriteParagraphs = async ( ) => { const bookSettings = getBookSettings(); - const charactersForChapter = referenceCards.characters.map((c) => ({ - name: c.name, - summary: c.referenceCard, - })); + const charactersForChapter = referenceCards.characters + .filter((c) => c.name !== "generic-avatar") // Exclude synthetic generic-avatar from LLM prompts + .map((c) => ({ name: c.name, summary: c.referenceCard })); const jsonCharacters = buildJsonCharacters(charactersForChapter); // Prepare all chapter data diff --git a/apps/pipeline/src/tools/importScannedBook.ts b/apps/pipeline/src/tools/importScannedBook.ts index 8b0cec71..b47969aa 100644 --- a/apps/pipeline/src/tools/importScannedBook.ts +++ b/apps/pipeline/src/tools/importScannedBook.ts @@ -27,6 +27,7 @@ import type { ChapterDetectionResult } from "../scan-server/chapterDetector"; import type { BookAnalysis, BookCharacter } from "../scan-server/ocrSchema"; import { generateBookHtml, type GeneratedChapterHtml } from "../scan-server/htmlGenerator"; import { generateCharacterImageWithOpenAI } from "./new-tooling/generate-pictures-for-entities"; +import { computeParagraphCount } from "../lib/paragraphCount"; import "dotenv/config"; const SCANNED_BOOKS_DIR = path.resolve(__dirname, "../../scanned-books"); @@ -386,8 +387,9 @@ async function step4_ImportChapters( console.log(` Importing ${chapters.length} chapters`); for (const chapter of chapters) { + const paragraphCount = computeParagraphCount(chapter.html); console.log( - ` Chapter ${chapter.chapterNumber}: ${chapter.title || "(no title)"} (${chapter.paragraphCount} paragraphs)`, + ` Chapter ${chapter.chapterNumber}: ${chapter.title || "(no title)"} (${paragraphCount} paragraphs)`, ); // Upload HTML content @@ -408,6 +410,7 @@ async function step4_ImportChapters( basename, chapterNumber: chapter.chapterNumber, title: chapter.title || undefined, + paragraphCount, sourceFormat: "HTML", }); } diff --git a/apps/pipeline/src/tools/importScannedBookIncremental.ts b/apps/pipeline/src/tools/importScannedBookIncremental.ts index 954375b9..6a3be17c 100644 --- a/apps/pipeline/src/tools/importScannedBookIncremental.ts +++ b/apps/pipeline/src/tools/importScannedBookIncremental.ts @@ -15,6 +15,7 @@ import type { DetectedChapter } from "../scan-server/chapterDetector"; import type { ChapterAnalysis, BookCharacter } from "../scan-server/ocrSchema"; import { generateChapterHtml } from "../scan-server/htmlGenerator"; import { generateCharacterImageWithOpenAI } from "./new-tooling/generate-pictures-for-entities"; +import { computeParagraphCount } from "../lib/paragraphCount"; const SCANNED_BOOKS_DIR = path.resolve(__dirname, "../../scanned-books"); @@ -330,6 +331,7 @@ async function importChapter( // Upload HTML const folderPath = `${bookPath}/chapters-source`; const basename = `chapter-${chapter.chapterNumber}.html`; + const paragraphCount = computeParagraphCount(generated.html); await convex.uploadFile({ folderPath, @@ -345,6 +347,7 @@ async function importChapter( basename, chapterNumber: chapter.chapterNumber, title: chapter.title || undefined, + paragraphCount, sourceFormat: "HTML", }); diff --git a/apps/pipeline/src/tools/new-tooling/generate-flux-schnel-image.ts b/apps/pipeline/src/tools/new-tooling/generate-flux-schnel-image.ts index 5a42c43f..36842ad4 100644 --- a/apps/pipeline/src/tools/new-tooling/generate-flux-schnel-image.ts +++ b/apps/pipeline/src/tools/new-tooling/generate-flux-schnel-image.ts 
@@ -8,6 +8,7 @@ import { sanitizePromptForModeration, generateAbstractPortraitPrompt, } from "./generate-pictures-for-entities"; +import { logError } from "src/helpers/logError"; const replicate = new Replicate({ auth: process.env.REPLICATE_API_TOKEN }); @@ -138,22 +139,47 @@ export const generateFluxImage = async ( finalPrompt = `${generalPrompt} Only scene-setting environment. ${prompt}`; } + // this is input for flux-2-pro + // const input = { + // aspect_ratio: type === "avatar" ? "1:1" : "16:9", + // input_images: [], + // output_format: type === "background" ? "webp" : "png", + // output_quality: 80, + // prompt: finalPrompt, + // resolution: "1 MP", + // safety_tolerance: 5, + // seed: 43605, + // }; + const input = { + images: [], + prompt: finalPrompt, + go_fast: false, aspect_ratio: type === "avatar" ? "1:1" : "16:9", - input_images: [], output_format: type === "background" ? "webp" : "png", output_quality: 80, - prompt: finalPrompt, - resolution: "1 MP", - safety_tolerance: 5, - seed: 43605, + output_megapixels: "1", + disable_safety_checker: true, }; try { - const output = await replicate.run("black-forest-labs/flux-2-pro", { input }); + const output = await replicate.run("black-forest-labs/flux-2-klein-4b", { input }); - // @ts-expect-error wrong types of replicate - flux-2-pro returns object with .url() method - const url = output.url(); + let url: string; + try { + // @ts-expect-error wrong types of replicate - flux-2-pro returns object with .url() method + url = output.url(); + } catch { + try { + url = (output as unknown as { url: () => string }[])[0].url(); + } catch (e) { + logError("Failed to get URL from output", e); + } + } + + if (!url!) { + throw new Error("Failed to get URL from output"); + } logger.info(`Replicate returned URL: ${url}`); @@ -198,7 +224,7 @@ export const generateFluxImage = async ( if (require.main === module) { generateFluxImage( - "A beautiful woman with long brown hair and blue eyes", + "A beautiful woman with long brown hair and blue eyes, seductive pose, naked, red lips, sexy, elegant, beautiful, 18 years old, breasts exposed", "test", "SinCity style", "avatar", diff --git a/apps/pipeline/src/tools/new-tooling/generate-pictures-for-entities.ts b/apps/pipeline/src/tools/new-tooling/generate-pictures-for-entities.ts index ff32a7da..3e41542b 100644 --- a/apps/pipeline/src/tools/new-tooling/generate-pictures-for-entities.ts +++ b/apps/pipeline/src/tools/new-tooling/generate-pictures-for-entities.ts @@ -331,18 +331,6 @@ Propaganda posters for their graphic boldness and limited color palette. filteredPrompts.map(async (prompt) => { if (!knownCharactersArray.includes(prompt.name)) { console.log("Generating for ", prompt.name); - - // const translationPrompt = `Process the following draft of a visual prompt: "${prompt.visualGuide}". Remove relations (who is a cousin to who, etc), information about what happens to that person, etc. - // Remove any indication of episodic things, for example someone getting a bruise later. Leave this as a purely visual information, based on what we know. - // Remove any indication of nudity, sexual content, etc. Remove suggestions that someone is naked or descriptions of private body parts. - // If prompt in different language than English, translate it to English. - // Reply with prompt directly, without any other text, so this can be used directly as a prompt for image generation. 
Do not say: "Here is the prompt" or "understood", just reply with the prompt.`; - // const visulGuideTranslatedAndCleaned = await callClaude(translationPrompt, undefined, 10, 0); - - // console.log(visulGuideTranslatedAndCleaned); - // const image = await generateImage(visulGuideTranslatedAndCleaned, prompt.name); - // const image = await generateImage(visulGuideTranslatedAndCleaned, prompt.name); - await generateAndSaveCharacterImage(prompt.visualGuide, prompt.name, generalPrompt); } }), diff --git a/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-summary.ts b/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-summary.ts index c0b4d857..e5453e3a 100644 --- a/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-summary.ts +++ b/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-summary.ts @@ -1,10 +1,11 @@ -import { callClaude, callGeminiWrapper } from "../../callClaude"; +import { callGeminiWrapper } from "../../callClaude"; import { getChaptersUpTo } from "../../helpers/getChaptersUpTo"; import { getBookSettings } from "../../helpers/getBookSettings"; import { writeBookFile } from "../../helpers/writeBookFile"; import { readBookFile } from "../../helpers/readBookFile"; import { FILE_TYPE } from "../../helpers/filesHelpers"; import { logger } from "../../logger"; +import { callGrokAzure } from "src/callGrokAzure"; export const makeRollingChapterSummaries = async () => { const bookSettings = getBookSettings(); @@ -78,7 +79,7 @@ Provide your summary clearly organized according to the structure above, explici `; // Use `prompt` with your LLM here and store the output as `summary` - const llmProviders = [callGeminiWrapper, callClaude]; + const llmProviders = [callGrokAzure, callGeminiWrapper]; const selectedProvider = llmProviders[attempt % llmProviders.length]; try { const summary = await selectedProvider( diff --git a/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-with-paragraphs-json-summary.ts b/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-with-paragraphs-json-summary.ts index 49c6e6f4..cac1c095 100644 --- a/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-with-paragraphs-json-summary.ts +++ b/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-with-paragraphs-json-summary.ts @@ -7,6 +7,8 @@ import { readBookFile } from "../../helpers/readBookFile"; import { FILE_TYPE } from "../../helpers/filesHelpers"; import { writeBookFile } from "../../helpers/writeBookFile"; import { callSlowGeminiWithThinkingAndSchemaAndParsed } from "../../callFastGemini"; +import { callGrokAzureWithSchema } from "src/callGrokAzure"; +import { buildParagraphsForSummary } from "./summaryParagraphs"; // Define the schema for reference cards response const ScenesSummariesPerChapterSchema = z.object({ @@ -41,12 +43,7 @@ export async function generateSingleChapterSummary( ): Promise { const { chapterNum, paragraphs, rollingSummary, bookLanguage = "English" } = options; - const paragraphsForPage = paragraphs - .map( - (paragraph) => - `

${paragraph.text.trim().replace(/"/g, "'")}

`, - ) - .join("\n"); + const paragraphsForPage = buildParagraphsForSummary(paragraphs); const prompt = ` ## Fiction Book Chapter Summary @@ -169,12 +166,7 @@ export const turnChapterSummariesIntoBulletPointsMappedToParagraphs = async () = const paragraphsFromChapter = getParagraphsFromChapter(chapterNum, true, true); - const paragraphsForPage = paragraphsFromChapter - .map( - (paragraph) => - `

${paragraph.text.trim().replace(/"/g, "'")}

`, - ) - .join("\n"); + const paragraphsForPage = buildParagraphsForSummary(paragraphsFromChapter); const prompt = ` ## Fiction Book Chapter Summary @@ -235,17 +227,16 @@ Provide your summary clearly organized according to the structure above, explici let summary: ScenesSummariesPerChapter; try { - summary = (await callSlowGeminiWithThinkingAndSchemaAndParsed( + summary = (await callGrokAzureWithSchema( `${prompt}\n Reply in the language of the book. It's usually Polish or English. Your instructions are in English so you often reply in English, buts its VERY important to reply in Polish when the book is in Polish, and same goes for other languages..`, ScenesSummariesPerChapterSchema, )) as ScenesSummariesPerChapter; } catch (e) { console.error(`Error for chapter ${chapterNum}`, e); try { - summary = (await callClaude( + summary = (await callSlowGeminiWithThinkingAndSchemaAndParsed( `${prompt}\n Reply in the language of the book. It's usually Polish or English. Your instructions are in English so you often reply in English, buts its VERY important to reply in Polish when the book is in Polish, and same goes for other languages.`, ScenesSummariesPerChapterSchema, - 2, )) as ScenesSummariesPerChapter; } catch (e) { console.error(`Error for chapter ${chapterNum}`, e); diff --git a/apps/pipeline/src/tools/new-tooling/get-chapter-title.spec.ts b/apps/pipeline/src/tools/new-tooling/get-chapter-title.spec.ts new file mode 100644 index 00000000..7abdf0bd --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/get-chapter-title.spec.ts @@ -0,0 +1,107 @@ +import { DOMParser, type Element as XMLElement } from "@xmldom/xmldom"; +import { describe, it, expect } from "vitest"; +import { getChapterTitle } from "./get-chapter-title"; + +describe("getChapterTitle", () => { + it("should return the chapter title", () => { + const chapter = `Chapter 1Content 1`; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("Chapter 1"); + }); + + it("should handle hgroup with label, ordinal, and title", () => { + const chapter = `
+
+

+ Book + II +

+

The Castle

+
+
`; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("Book II: The Castle"); + }); + + it("should handle hgroup with ordinal and title", () => { + const chapter = `
+
+

I

+

I Go to Styles

+
+
`; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("I: I Go to Styles"); + }); + + it("should use data-epub-type as title when no hgroup with title exists", () => { + const chapter = `
+

To my Mother

+
`; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("Dedication"); + }); + + it("should handle hgroup with title but no h2", () => { + const chapter = `
+
+

Prologue

+
+
`; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("Prologue"); + }); + + it("should handle hgroup with title and h2 but no ordinal", () => { + const chapter = `
+
+

Introduction

+

The Beginning

+
+
`; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("The Beginning"); + }); + + it("should handle legacy chapter with act (h3) element", () => { + const chapter = ` +

Act I

+ The Opening +
`; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("Act I, The Opening"); + }); + + it("should handle legacy chapter with title and subtitle", () => { + const chapter = ` + Chapter One. + In which our hero begins + `; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("Chapter One, In which our hero begins"); + }); +}); diff --git a/apps/pipeline/src/tools/new-tooling/get-chapter-title.ts b/apps/pipeline/src/tools/new-tooling/get-chapter-title.ts index 5ef509f2..fd9bc463 100644 --- a/apps/pipeline/src/tools/new-tooling/get-chapter-title.ts +++ b/apps/pipeline/src/tools/new-tooling/get-chapter-title.ts @@ -2,16 +2,75 @@ import { type Element as XMLElement } from "@xmldom/xmldom"; const getTitleText = (el?: XMLElement | null) => (el ? (el.textContent || "").trim() : ""); -export const getChapterTitle = (chapter: XMLElement): string => { - let currentAct = ""; +const getAttribute = (el: XMLElement, name: string): string | null => { + const attr = el.getAttribute(name); + return attr ? attr.trim() : null; +}; + +const hasEpubType = (el: XMLElement, type: string): boolean => { + const epubType = getAttribute(el, "data-epub-type"); + return epubType ? epubType.includes(type) : false; +}; + +const extractLabelAndOrdinalFromSpans = (h2: XMLElement): { label: string; ordinal: string } => { + let label = ""; + let ordinal = ""; + + const spans = h2.getElementsByTagName("span"); + for (let i = 0; i < spans.length; i++) { + const span = spans[i] as XMLElement; + const spanEpubType = getAttribute(span, "data-epub-type"); + if (spanEpubType === "label") { + label = getTitleText(span); + } else if (spanEpubType && spanEpubType.includes("ordinal")) { + ordinal = getTitleText(span); + } + } - if (chapter.getElementsByTagName("h2").length > 0) { - console.warn("h2 found in chapter, not supported yet"); + return { label, ordinal }; +}; + +const formatTitleWithOrdinal = (label: string, ordinal: string, title: string): string => { + if (label && ordinal) { + return `${label} ${ordinal}: ${title}`; } - if (chapter.getElementsByTagName("h1").length > 0) { - console.warn("h1 found in chapter, not supported yet"); + if (ordinal) { + return `${ordinal}: ${title}`; + } + return title; +}; + +const getTitleFromHgroup = (hgroup: XMLElement): string | null => { + const titleParagraphs = Array.from(hgroup.getElementsByTagName("p")).filter((p) => + hasEpubType(p as XMLElement, "title"), + ); + + if (titleParagraphs.length === 0) { + return null; } + const titleText = getTitleText(titleParagraphs[0] as XMLElement); + const h2Elements = hgroup.getElementsByTagName("h2"); + + if (h2Elements.length === 0) { + return titleText; + } + + const h2 = h2Elements[0] as XMLElement; + + // Check if h2 itself has ordinal attribute + if (hasEpubType(h2, "ordinal")) { + return formatTitleWithOrdinal("", getTitleText(h2), titleText); + } + + // Check for spans within h2 + const { label, ordinal } = extractLabelAndOrdinalFromSpans(h2); + return formatTitleWithOrdinal(label, ordinal, titleText); +}; + +const getLegacyChapterTitle = (chapter: XMLElement): string => { + let currentAct = ""; + const actElements = chapter.getElementsByTagName("h3").length > 0 ? 
chapter.getElementsByTagName("h3") @@ -31,16 +90,36 @@ export const getChapterTitle = (chapter: XMLElement): string => { currentAct = getTitleText(actElements[0]); } + console.log(`titleElements: ${titleElements.length}`); + const titleText = getTitleText(titleElements[0]); const subtitleText = getTitleText(subtitleElements[0]); - const chapterTitle = [ + return [ currentAct, titleText && subtitleText ? titleText.replace(/\.$/, "") : titleText, subtitleText, ] .filter(Boolean) .join(", "); +}; + +export const getChapterTitle = (chapter: XMLElement): string => { + // Check for hgroup structure first + const hgroups = chapter.getElementsByTagName("hgroup"); + if (hgroups.length > 0) { + const hgroupTitle = getTitleFromHgroup(hgroups[0] as XMLElement); + if (hgroupTitle) { + return hgroupTitle; + } + } + + // If no hgroup with title, check for data-epub-type on the root element + const epubType = getAttribute(chapter, "data-epub-type"); + if (epubType) { + return epubType.charAt(0).toUpperCase() + epubType.slice(1); + } - return chapterTitle; + // Fall back to existing logic for backward compatibility + return getLegacyChapterTitle(chapter); }; diff --git a/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book-prompt.md b/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book-prompt.md index 5ab75ae5..519b60bf 100644 --- a/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book-prompt.md +++ b/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book-prompt.md @@ -1,7 +1,7 @@ # Task Process a text chapter by chapter. For each story character who appears or is mentioned, create reference cards reflecting the knowledge about this person - how would a human introduce someone to that character without spoiling it. The information should be based mostly on when we first meet the character, but pointing towards the knowledge we know about him from the whole book - so avoid spoilers, but use the later revealed facts to determine whats important about the initial impression. -Maybe when we first meet the character he is working on his car in his garage. If in the rest of his book he does that from time to time, or we learn he is a mechanic, or a driver, or whatever like that, that's important detail. But if he is not mentioned in the context of cars again, that is irrelevant detail. No spoilers! Do not mention how things end or who they become. Only the most generic but relevant information. So skip anything that's surprising or important action that happened in the book, but build the background about the person. Who that person was at when the story starts. Do not mention any important life changes, like getting married, dying, getting a promotion, unless it happened at the very moment we learn about that person. +Maybe when we first meet the character he is working on his car in his garage. If in the rest of his book he does that from time to time, or we learn he is a mechanic, or a driver, or whatever like that, that's important detail. But if he is not mentioned in the context of cars again, that is irrelevant detail. No spoilers! Do not mention how things end or who they become. Only the most generic but relevant information. So skip anything that's surprising or important action that happened later in the book, but build the background about the person. Who that person was when the story starts. 
Do not mention any important life changes, like getting married, dying, getting a promotion, unless it happened at the very moment we learn about that person. ## **Output Goal: Character-Centric History** @@ -39,7 +39,6 @@ Return the _final, complete_ results after processing _all_ chapters in the foll - Focus on reminding the reader who the character _is_ based on past context (role, relationships, key history), not what they _do_ or _say_ in the current chapter (N). Avoid spoilers! - Do not write more than 1-2 short sentences about the person. This is not a summary of a book, this is a memory-jog to quickly get someone to connect character name with the actual character. - If the person is known by two names, add the second one in the parentheses. -- This is not a summary of a book, this is a memory-jog to quickly get someone to connect character name with the actual character. - Jeśli tekst jest po Polsku, odpowiedz po Polsku ## Book text diff --git a/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book.ts b/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book.ts index ccd9e92e..73f2bfdb 100644 --- a/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book.ts +++ b/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book.ts @@ -52,7 +52,17 @@ ${knownCharactersMapped}\n\n` console.log("combinedPrompt length:", combinedPrompt.length); - return callGrokWithSchema(combinedPrompt, NewReferenceCardsResponseSchema); + const response = await callGrokWithSchema(combinedPrompt, NewReferenceCardsResponseSchema); + + // Add synthetic generic-avatar for unknown/minor speakers + // This will get an avatar generated but won't be passed to the rewrite prompts + response.characters.push({ + name: "generic-avatar", + referenceCard: + "A mysterious figure shown from behind or in silhouette. No distinct facial features visible. Enigmatic and anonymous, suitable for representing any unnamed character. Atmospheric lighting with the figure partially obscured by shadow or mist.", + }); + + return response; }; if (require.main === module) { diff --git a/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.spec.ts b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.spec.ts new file mode 100644 index 00000000..9fa1dabf --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.spec.ts @@ -0,0 +1,36 @@ +import { describe, expect, it } from "vitest"; +import { restoreUnwrappedBlocks } from "./restore-unwrapped-blocks"; + +describe("restoreUnwrappedBlocks", () => { + it("wraps dangling text and inline nodes using the original block element", () => { + const original = "

Alpha

Miss Howard nodded grimly.

Omega

"; + const model = "

Alpha

Miss Howard nodded grimly.

Omega

"; + + expect(restoreUnwrappedBlocks(original, model)).toBe( + "

Alpha

Miss Howard nodded grimly.

Omega

", + ); + }); + + it("preserves original attributes when rewrapping", () => { + const original = '

First

Second

'; + const model = '

First

Second'; + + expect(restoreUnwrappedBlocks(original, model)).toBe( + '

First

Second

', + ); + }); + + it("leaves unmatched dangling text untouched", () => { + const original = "

One

Two

"; + const model = "

One

Extra

Two

"; + + expect(restoreUnwrappedBlocks(original, model)).toBe("

One

Extra

Two

"); + }); + + it("returns the input when no dangling nodes exist", () => { + const original = "

One

Two

"; + const model = "

One

Two

"; + + expect(restoreUnwrappedBlocks(original, model)).toBe(model); + }); +}); diff --git a/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.ts b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.ts new file mode 100644 index 00000000..f3b03c2b --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.ts @@ -0,0 +1,166 @@ +import { DOMParser, XMLSerializer } from "@xmldom/xmldom"; +import type { Element as XMLElement, Node as XMLNode } from "@xmldom/xmldom"; + +type OriginalBlock = { element: XMLElement; tagName: string; normalizedText: string }; + +function normalizeText(text: string): string { + return text.replace(/\s+/g, " ").trim(); +} + +function getTextContent(node: XMLNode): string { + if (node.nodeType === node.TEXT_NODE) { + return node.nodeValue ?? ""; + } + if (node.nodeType === node.ELEMENT_NODE && node.childNodes) { + let text = ""; + for (let i = 0; i < node.childNodes.length; i++) { + text += getTextContent(node.childNodes[i]); + } + return text; + } + return ""; +} + +function getElementAttributes(element: XMLElement): Record { + const attrs: Record = {}; + if (!element.attributes) return attrs; + for (let i = 0; i < element.attributes.length; i++) { + const attr = element.attributes.item(i); + if (attr?.name) { + attrs[attr.name] = attr.value ?? ""; + } + } + return attrs; +} + +function buildAttributeString(attributes: Record): string { + const entries = Object.entries(attributes); + if (entries.length === 0) return ""; + return " " + entries.map(([key, value]) => `${key}="${value.replace(/"/g, """)}"`).join(" "); +} + +function stripXmlns(serialized: string): string { + return serialized.replace(/\s+xmlns="http:\/\/www\.w3\.org\/1999\/xhtml"/g, ""); +} + +function serializeNodes(nodes: XMLNode[], serializer: XMLSerializer): string { + return nodes.map((node) => stripXmlns(serializer.serializeToString(node))).join(""); +} + +function extractOriginalBlocks(section: XMLElement): OriginalBlock[] { + const blocks: OriginalBlock[] = []; + for (let i = 0; i < section.childNodes.length; i++) { + const node = section.childNodes[i]; + if (node.nodeType !== node.ELEMENT_NODE) continue; + const element = node as XMLElement; + const tagName = (element.tagName || "").toLowerCase(); + const normalizedText = normalizeText(getTextContent(element)); + blocks.push({ element, tagName, normalizedText }); + } + return blocks; +} + +function findMatchingOriginalIndex( + blocks: OriginalBlock[], + startIndex: number, + normalizedText: string, +): number { + if (!normalizedText) return -1; + for (let i = startIndex; i < blocks.length; i++) { + if (blocks[i].normalizedText === normalizedText) { + return i; + } + } + return -1; +} + +function wrapDanglingNodes( + originalElement: XMLElement, + danglingNodes: XMLNode[], + serializer: XMLSerializer, +): string { + const tagName = (originalElement.tagName || "").toLowerCase(); + const attrs = buildAttributeString(getElementAttributes(originalElement)); + const inner = serializeNodes(danglingNodes, serializer); + return `<${tagName}${attrs}>${inner}`; +} + +export function restoreUnwrappedBlocks(originalHtml: string, modelHtml: string): string { + if (originalHtml === modelHtml) return modelHtml; + + const parser = new DOMParser(); + const originalDoc = parser.parseFromString(`
<section>${originalHtml}</section>
`, "text/html"); + const modelDoc = parser.parseFromString(`
<section>${modelHtml}</section>
`, "text/html"); + + const originalSection = originalDoc.getElementsByTagName("section")[0]; + const modelSection = modelDoc.getElementsByTagName("section")[0]; + if (!originalSection || !modelSection) return modelHtml; + + const originalBlocks = extractOriginalBlocks(originalSection); + if (originalBlocks.length === 0) return modelHtml; + + const allowedTags = new Set(originalBlocks.map((block) => block.tagName)); + const serializer = new XMLSerializer(); + + const output: string[] = []; + let danglingNodes: XMLNode[] = []; + let changed = false; + let originalIndex = 0; + + const flushDangling = () => { + if (danglingNodes.length === 0) return; + + const danglingText = normalizeText(danglingNodes.map((node) => getTextContent(node)).join("")); + const matchIndex = findMatchingOriginalIndex(originalBlocks, originalIndex, danglingText); + + if (matchIndex >= 0) { + output.push(wrapDanglingNodes(originalBlocks[matchIndex].element, danglingNodes, serializer)); + originalIndex = matchIndex + 1; + changed = true; + } else { + output.push(serializeNodes(danglingNodes, serializer)); + } + + danglingNodes = []; + }; + + for (let i = 0; i < modelSection.childNodes.length; i++) { + const node = modelSection.childNodes[i]; + + if (node.nodeType === node.TEXT_NODE) { + const text = node.nodeValue ?? ""; + if (normalizeText(text) === "") { + continue; + } + danglingNodes.push(node); + continue; + } + + if (node.nodeType === node.ELEMENT_NODE) { + const element = node as XMLElement; + const tagName = (element.tagName || "").toLowerCase(); + + if (!allowedTags.has(tagName)) { + danglingNodes.push(node); + continue; + } + + flushDangling(); + + // Keep valid block element as-is + output.push(stripXmlns(serializer.serializeToString(element))); + + const elementText = normalizeText(getTextContent(element)); + const matchIndex = findMatchingOriginalIndex(originalBlocks, originalIndex, elementText); + if (matchIndex >= 0) { + originalIndex = matchIndex + 1; + } + continue; + } + } + + flushDangling(); + + if (!changed) return modelHtml; + return output.join(""); +} diff --git a/apps/pipeline/src/tools/new-tooling/section-wrapper.spec.ts b/apps/pipeline/src/tools/new-tooling/section-wrapper.spec.ts new file mode 100644 index 00000000..9bebbc68 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/section-wrapper.spec.ts @@ -0,0 +1,37 @@ +import { describe, expect, it } from "vitest"; +import { buildSectionWrapper, extractSectionInner, parseAttributes } from "./section-wrapper"; + +describe("section-wrapper", () => { + it("parses attributes with quotes", () => { + const attrs = parseAttributes(' data-chapter="2" data-epub-type="chapter"'); + expect(attrs).toEqual({ "data-chapter": "2", "data-epub-type": "chapter" }); + }); + + it("parses attributes with mixed quotes and unquoted values", () => { + const attrs = parseAttributes(" data-id='x' data-num=3 data-flag "); + expect(attrs).toEqual({ "data-id": "x", "data-num": "3", "data-flag": "" }); + }); + + it("extracts section inner and wrapper", () => { + const html = '
<section data-chapter="2" data-epub-type="chapter"><p>Hi</p></section>
'; + const result = extractSectionInner(html); + expect(result.inner).toBe("
<p>Hi</p>
"); + expect(result.wrapper).toEqual({ + tagName: "section", + attributes: { "data-chapter": "2", "data-epub-type": "chapter" }, + }); + }); + + it("returns original text when no section wrapper", () => { + const html = "
<p>Hi</p>
"; + const result = extractSectionInner(html); + expect(result.inner).toBe(html); + expect(result.wrapper).toBeNull(); + }); + + it("rebuilds section wrapper with attributes", () => { + const html = '

Hi

'; + const result = extractSectionInner(html); + expect(buildSectionWrapper(result.inner, result.wrapper)).toBe(html); + }); +}); diff --git a/apps/pipeline/src/tools/new-tooling/section-wrapper.ts b/apps/pipeline/src/tools/new-tooling/section-wrapper.ts new file mode 100644 index 00000000..447c42be --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/section-wrapper.ts @@ -0,0 +1,73 @@ +export type SectionWrapper = { tagName: string; attributes: Record }; + +export type SectionExtract = { inner: string; wrapper: SectionWrapper | null }; + +function isNameChar(char: string): boolean { + return /[A-Za-z0-9_:-]/.test(char); +} + +export function parseAttributes(raw: string): Record { + const attrs: Record = {}; + let i = 0; + + while (i < raw.length) { + while (i < raw.length && /\s/.test(raw[i])) i++; + if (i >= raw.length) break; + + let name = ""; + while (i < raw.length && isNameChar(raw[i])) { + name += raw[i]; + i += 1; + } + + if (!name) break; + + while (i < raw.length && /\s/.test(raw[i])) i++; + let value = ""; + + if (raw[i] === "=") { + i += 1; + while (i < raw.length && /\s/.test(raw[i])) i++; + + const quote = raw[i]; + if (quote === '"' || quote === "'") { + i += 1; + const start = i; + while (i < raw.length && raw[i] !== quote) i++; + value = raw.slice(start, i); + if (raw[i] === quote) i += 1; + } else { + const start = i; + while (i < raw.length && !/\s|>/.test(raw[i])) i++; + value = raw.slice(start, i); + } + } + + attrs[name] = value; + } + + return attrs; +} + +function buildAttributeString(attributes: Record): string { + const entries = Object.entries(attributes); + if (entries.length === 0) return ""; + return ( + " " + entries.map(([key, value]) => `${key}="${value.replace(/\\"/g, """)}"`).join(" ") + ); +} + +export function extractSectionInner(html: string): SectionExtract { + const match = html.match(/^\s*]*)>([\s\S]*)<\/section>\s*$/i); + if (!match) { + return { inner: html, wrapper: null }; + } + + const attributes = parseAttributes(match[1] ?? ""); + return { inner: match[2] ?? "", wrapper: { tagName: "section", attributes } }; +} + +export function buildSectionWrapper(inner: string, wrapper: SectionWrapper | null): string { + if (!wrapper) return inner; + return `<${wrapper.tagName}${buildAttributeString(wrapper.attributes)}>${inner}`; +} diff --git a/apps/pipeline/src/tools/new-tooling/summaryParagraphs.spec.ts b/apps/pipeline/src/tools/new-tooling/summaryParagraphs.spec.ts new file mode 100644 index 00000000..c262ef30 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/summaryParagraphs.spec.ts @@ -0,0 +1,27 @@ +import { describe, it, expect } from "vitest"; +import { buildParagraphsForSummary } from "./summaryParagraphs"; + +describe("buildParagraphsForSummary", () => { + it("wraps each paragraph in a
<p>
with the data index", () => { + const output = buildParagraphsForSummary([ + { dataIndex: 0, text: "Hello world" }, + { dataIndex: 1, text: "Second paragraph" }, + ]); + + expect(output).toBe('
<p data-index="0">Hello world</p>\n<p data-index="1">Second paragraph</p>
'); + }); + + it("preserves double quotes in text content", () => { + const output = buildParagraphsForSummary([{ dataIndex: 2, text: 'He said "hello" and left.' }]); + + expect(output).toContain('He said "hello" and left.'); + }); + + it("keeps embedded HTML attributes double-quoted", () => { + const output = buildParagraphsForSummary([ + { dataIndex: 3, text: 'Mrs. Inglethorp\'s bedroom' }, + ]); + + expect(output).toContain('alt="Mrs. Inglethorp\'s bedroom"'); + }); +}); diff --git a/apps/pipeline/src/tools/new-tooling/summaryParagraphs.ts b/apps/pipeline/src/tools/new-tooling/summaryParagraphs.ts new file mode 100644 index 00000000..88a323b2 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/summaryParagraphs.ts @@ -0,0 +1,5 @@ +export function buildParagraphsForSummary(paragraphs: { text: string; dataIndex: number }[]) { + return paragraphs + .map((paragraph) => `
<p data-index="${paragraph.dataIndex}">${paragraph.text.trim()}</p>
`) + .join("\n"); +} diff --git a/apps/pipeline/src/tools/se-converter/importSEBook.ts b/apps/pipeline/src/tools/se-converter/importSEBook.ts index fe36a2b2..95d1f582 100644 --- a/apps/pipeline/src/tools/se-converter/importSEBook.ts +++ b/apps/pipeline/src/tools/se-converter/importSEBook.ts @@ -14,10 +14,11 @@ import * as fs from "fs"; import * as path from "path"; +import { JSDOM } from "jsdom"; import { AdminConvexHttpClient } from "../../lib/AdminConvexHttpClient"; import { api } from "@bookgenius/convex/_generated/api"; import { convertSEBook, getSEBookImagesDir, type SEImageReference } from "./index"; -import { JSDOM } from "jsdom"; +import { computeParagraphCount } from "../../lib/paragraphCount"; const SE_BOOKS_DIR = path.resolve(__dirname, "../../../standardebooks-data/books"); const LEGACY_BOOKS_DIR = path.resolve(__dirname, "../../../../../books"); @@ -289,9 +290,7 @@ async function step3_ImportChapters( for (const chapter of chapters) { console.log(` Chapter ${chapter.chapterNumber}: ${chapter.title || "(no title)"}`); - const dom = new JSDOM(chapter.html); - const section = dom.window.document.querySelector("section"); - const paragraphCount = section?.children.length || 0; + const paragraphCount = computeParagraphCount(chapter.html); await client.action(api.chapterCompiler.uploadHtmlSourceChapter, { bookPath, diff --git a/apps/pipeline/tsconfig.json b/apps/pipeline/tsconfig.json index a017353a..b35939fb 100644 --- a/apps/pipeline/tsconfig.json +++ b/apps/pipeline/tsconfig.json @@ -1,7 +1,7 @@ { "compilerOptions": { "target": "ES2022", - "lib": ["ES2022"], + "lib": ["ES2022", "DOM"], "module": "ES2022", "moduleResolution": "bundler", "types": ["bun-types", "jest"], diff --git a/apps/player/src/components/modals/BookChaptersModal.tsx b/apps/player/src/components/modals/BookChaptersModal.tsx index ffc27cd5..56e471cd 100644 --- a/apps/player/src/components/modals/BookChaptersModal.tsx +++ b/apps/player/src/components/modals/BookChaptersModal.tsx @@ -3,7 +3,6 @@ import { useTranslation } from "react-i18next"; import ModalUI from "@player/components/modals/ModalUI"; import { systemNavigateTo } from "@player/helpers/paragraphsNavigation"; -import { getChapterTitle } from "@player/utils/getChapterTitle"; import { Button } from "../ui/button"; import { useBookConvex } from "@player/context/BookConvexContext"; import { useLocationRange } from "@player/hooks/useLocationRange"; @@ -34,12 +33,12 @@ const BookChaptersModal: React.FC = ({ onClose }) => { return bookData.chapters.map((chapter, index) => { return { id: parseInt(chapter.id), - title: getChapterTitle(parseInt(chapter.id), t), + title: chapter.title, page: (index + 1).toString(), isLocked: hasDemoAccess && parseInt(chapter.id, 10) > maxDemoChapter, }; }); - }, [t, hasDemoAccess, bookData]); + }, [hasDemoAccess, bookData]); const navigateToChapter = (chapterId: number) => { systemNavigateTo({ currentChapter: chapterId, currentParagraph: 0 }); diff --git a/apps/player/src/components/modals/CharacterModal.tsx b/apps/player/src/components/modals/CharacterModal.tsx index f48d5d5f..3e6f643f 100644 --- a/apps/player/src/components/modals/CharacterModal.tsx +++ b/apps/player/src/components/modals/CharacterModal.tsx @@ -11,10 +11,10 @@ import { getSavedLocation, systemNavigateTo } from "@player/helpers/paragraphsNa import { useBookConvex } from "@player/context/BookConvexContext"; import { highlightSearchInParagraph } from "@player/utils/textHighlighting"; import { DialogEnhanceClose } from "../ui/dialog"; -import { getChapterTitle } from 
"@player/utils/getChapterTitle"; import { resolveCharacterSnapshot } from "@player/utils/characterOverrides"; import { isVideoFile } from "@player/helpers/isVideoFile"; import { getAvatarSource } from "@player/helpers/svgAvatars"; +import { slugToDisplayName } from "@player/helpers/minorCharacterUtils"; import { useBottomInput } from "@player/stores/modals/bottomInput.store"; import { useSearchModal } from "@player/stores/modals/searchModal.store"; import { FILTER_OPTIONS } from "@player/utils/filterOptions"; @@ -51,12 +51,21 @@ const CharacterModal: React.FC = ({ const { setValue } = useBottomInput(); const { openModal: openSearchModal, setLastClickedAppearanceId, setResults } = useSearchModal(); const { pauseAllTimers, showAllElements } = useElementVisibilityStore(); - const { charactersData } = useBookConvex(); + const { charactersData, bookData } = useBookConvex(); + const chapterTitle = useMemo( + () => bookData!.chapters.find((c) => c.id === String(chapter))?.title, + [chapter, bookData], + ); const matchingCharacter = useMemo( () => charactersData.find((c) => c.slug === characterSlug), [characterSlug, charactersData], ); + // Generic avatar for unknown/minor characters (if available) + const genericCharacter = useMemo( + () => charactersData.find((c) => c.slug === "generic-avatar"), + [charactersData], + ); const latestSummary = useMemo( () => (matchingCharacter ? findLatestSummaryInRange(matchingCharacter, endChapter) : ""), [matchingCharacter, endChapter], @@ -176,7 +185,45 @@ const CharacterModal: React.FC = ({ return option ? t(option.translationKey) : type; }; - if (!matchingCharacter) return null; + // Handle unknown characters (speakers not in characterMetadata) + if (!matchingCharacter) { + const displayName = slugToDisplayName(characterSlug); + // Use generic avatar from Convex if available, otherwise SVG fallback + const unknownAvatarSrc = + genericCharacter?.media?.avatarUrl ?? + getAvatarSource({ + slug: characterSlug, + characterName: displayName, + bookSlug: "", + infoPerChapter: [], + }); + + return ( + + +
+ {displayName} +
+

{displayName}

+ { + e.preventDefault(); + e.stopPropagation(); + handleOnClose(); + }} + /> +
+
+ ); + } if (!resolvedMediaSrc) { console.error("no resolved media src for character modal", matchingCharacter); @@ -294,8 +341,7 @@ const CharacterModal: React.FC = ({ - {appearance.percentInChapter}% {t("of_chapter")}{" "} - {getChapterTitle(appearance.chapter, t)} + {appearance.percentInChapter}% {t("of_chapter")} {chapterTitle} diff --git a/apps/player/src/components/modals/SearchModal.tsx b/apps/player/src/components/modals/SearchModal.tsx index 5eb75cf1..a2fb499f 100644 --- a/apps/player/src/components/modals/SearchModal.tsx +++ b/apps/player/src/components/modals/SearchModal.tsx @@ -8,7 +8,6 @@ import React, { useRef, } from "react"; import { useTranslation } from "react-i18next"; -import type { TFunction } from "i18next"; import { motion } from "motion/react"; import { Search, FileText, Minimize2, Maximize2, X } from "lucide-react"; @@ -29,10 +28,10 @@ import { AccordionTrigger, AccordionContent, } from "@player/components/ui/accordion"; -import { getChapterTitle } from "@player/utils/getChapterTitle"; import { cn } from "@player/lib/utils"; import { findScrollParent } from "@player/utils/findScrollParent"; import { FILTER_OPTIONS, FILTER_VALUE_MAP, type SearchFilter } from "@player/utils/filterOptions"; +import { useBookConvex } from "@player/context/BookConvexContext"; interface SearchModalProps { onClose: () => void; @@ -51,6 +50,7 @@ export const SearchModal: React.FC = ({ // eslint-disable-next-line complexity -- search UI with filtering, loading states, and result rendering }) => { const { t } = useTranslation(); + const { bookData } = useBookConvex(); const deferredResults = useDeferredValue(searchResults); @@ -368,8 +368,10 @@ export const SearchModal: React.FC = ({ c.id === String(chapter))?.title ?? "" + } items={items} - t={t} clickedAppearanceId={clickedAppearanceId} searchQuery={searchQuery} /> @@ -457,28 +459,27 @@ export const SearchModal: React.FC = ({ const ChapterGroup = memo(function ChapterGroup({ chapter, + chapterTitle, items, - t, clickedAppearanceId, searchQuery, }: { chapter: number; + chapterTitle: string; items: SearchResultItemData[]; - t: TFunction; clickedAppearanceId?: string; searchQuery?: string; }) { - const chapterTitle = useMemo( + const chapterTitleElement = useMemo( () => (
- {getChapterTitle(Number(chapter), t)} ({items.length}{" "} - {items.length === 1 ? "result" : "results"}) + {chapterTitle} ({items.length} {items.length === 1 ? "result" : "results"})
), - [chapter, items.length, t], + [chapterTitle, items.length], ); return ( @@ -487,7 +488,7 @@ const ChapterGroup = memo(function ChapterGroup({ className="border-book-primary-20 rounded-lg mb-3 overflow-hidden" > - {chapterTitle} + {chapterTitleElement}
diff --git a/apps/player/src/helpers/minorCharacterUtils.ts b/apps/player/src/helpers/minorCharacterUtils.ts new file mode 100644 index 00000000..b1e13229 --- /dev/null +++ b/apps/player/src/helpers/minorCharacterUtils.ts @@ -0,0 +1,24 @@ +/** + * Utilities for handling unknown/minor characters that are not in characterMetadata. + * Unknown characters are identified by speakers that have a data-speaker attribute + * but don't match any known character slug in the book's character list. + */ + +/** + * Check if a character slug is for an unknown/minor character + * by checking if it exists in the set of known character slugs. + */ +export function isUnknownCharacter(slug: string, knownSlugs: Set): boolean { + return !knownSlugs.has(slug); +} + +/** + * Convert a slug to a human-readable display name. + * "tall-soldier-at-gate" -> "Tall Soldier At Gate" + */ +export function slugToDisplayName(slug: string): string { + return slug + .split("-") + .map((word) => word.charAt(0).toUpperCase() + word.slice(1)) + .join(" "); +} diff --git a/apps/player/src/locales/en/translation.json b/apps/player/src/locales/en/translation.json index 135a1831..fe724cde 100644 --- a/apps/player/src/locales/en/translation.json +++ b/apps/player/src/locales/en/translation.json @@ -237,7 +237,7 @@ "chapters": "Chapters", "paragraph": "Paragraph", "chapter_percent": "Chapter Percent", - "of_chapter": "of Chapter", + "of_chapter": "of", "book_settings": "Book Settings", "open_chapter": "Open Chapter", "back_to_platform": "Back to Platform", diff --git a/apps/player/src/locales/pl/translation.json b/apps/player/src/locales/pl/translation.json index 2e16f837..b1e3b3a6 100644 --- a/apps/player/src/locales/pl/translation.json +++ b/apps/player/src/locales/pl/translation.json @@ -255,7 +255,7 @@ "chapters": "Rozdziały", "paragraph": "Paragraf", "chapter_percent": "Procent w Rozdziale", - "of_chapter": "rozdziału", + "of_chapter": "w", "book_settings": "Ustawienia Książki", "open_chapter": "Otwórz Rozdział", "back_to_platform": "Powrót do Platformy", diff --git a/apps/player/src/services/__tests__/formatB.test.ts b/apps/player/src/services/__tests__/formatB.test.ts index cdf9bf88..2e5152a9 100644 --- a/apps/player/src/services/__tests__/formatB.test.ts +++ b/apps/player/src/services/__tests__/formatB.test.ts @@ -180,6 +180,26 @@ describe("Format B", () => { // Pure em paragraphs should become didaskalia expect(result).toContain('data-is-didaskalia="true"'); }); + + it("marks pure em paragraphs inside format B speaker blocks as didaskalia", () => { + const input = `
+
+

To know my deed, 'twere best not know myself.

+

Knocking within

+

Wake Duncan with thy knocking! I would thou couldst!

+
+
`; + + const result = normalizeChapterHtml(input); + const parser = new DOMParser(); + const doc = parser.parseFromString(result, "text/html"); + + const didaskalia = Array.from(doc.querySelectorAll(".character-text p")).find((p) => + p.textContent?.includes("Knocking within"), + ); + + expect(didaskalia?.getAttribute("data-is-didaskalia")).toBe("true"); + }); }); describe("data-index injection", () => { diff --git a/apps/player/src/services/__tests__/paragraphCount.test.ts b/apps/player/src/services/__tests__/paragraphCount.test.ts new file mode 100644 index 00000000..6d04af74 --- /dev/null +++ b/apps/player/src/services/__tests__/paragraphCount.test.ts @@ -0,0 +1,26 @@ +/** + * @vitest-environment jsdom + */ +import { describe, it, expect } from "vitest"; +import { countParagraphsFromChapterHtml } from "../htmlNormalizer"; + +describe("countParagraphsFromChapterHtml", () => { + it("counts data-index in compiled HTML", () => { + const html = + '
<section><p data-index="0">A</p><p data-index="1">B</p><p data-index="2">C</p></section>
'; + expect(countParagraphsFromChapterHtml(html)).toBe(3); + }); + + it("counts normalized prose children", () => { + const html = '

Title

One

Two

'; + expect(countParagraphsFromChapterHtml(html)).toBe(3); + }); + + it("accounts for play rows in poemProse render mode", () => { + const html = `
+

Hello

+

World

+
`; + expect(countParagraphsFromChapterHtml(html, { renderMode: "poemProse" })).toBe(4); + }); +}); diff --git a/apps/player/src/services/htmlNormalizer.ts b/apps/player/src/services/htmlNormalizer.ts index 890e7a35..e42b0146 100644 --- a/apps/player/src/services/htmlNormalizer.ts +++ b/apps/player/src/services/htmlNormalizer.ts @@ -188,10 +188,14 @@ function transformFormatBToPlayRows(section: Element, doc: Document): void { // Move content paragraphs for (const innerChild of Array.from(child.children)) { + const isExplicitDidaskalia = innerChild.getAttribute("data-is-didaskalia") === "true"; + const isPureEm = + innerChild.tagName.toLowerCase() === "p" ? isPureEmParagraph(innerChild) : false; + const isDidaskalia = isExplicitDidaskalia || isPureEm; const p = innerChild.cloneNode(true) as Element; p.setAttribute("data-text-alignment", state.alignment); p.setAttribute("data-is-character", "false"); - p.setAttribute("data-is-didaskalia", "false"); + p.setAttribute("data-is-didaskalia", isDidaskalia ? "true" : "false"); characterText.appendChild(p); } @@ -349,6 +353,8 @@ export type RenderMode = "default" | "enhancedProse" | "poemProse"; export type EnhancedProseOptions = { speakerDisplayNames?: Map }; +export type ParagraphCountOptions = { renderMode?: RenderMode; bookForm?: string | null }; + function createPlayRowFromSpeakerGroup( paragraphs: Element[], doc: Document, @@ -761,6 +767,47 @@ export function normalizeBookHtml(html: string): string { return doc.body.innerHTML; } +function countDataIndexFromHtml(html: string): number { + const parser = new DOMParser(); + const doc = parser.parseFromString(html, "text/html"); + return doc.querySelectorAll("[data-index]").length; +} + +/** + * Count paragraphs the same way the player indexes them (via data-index). + * This ensures calculateReadProgress stays accurate without loading all chapters client-side. + */ +export function countParagraphsFromChapterHtml( + html: string, + options: ParagraphCountOptions = {}, +): number { + if (!html.trim()) { + return 0; + } + + if (typeof DOMParser === "undefined") { + throw new Error("DOMParser is not available. Provide a DOMParser implementation first."); + } + + let normalizedHtml = html; + if (detectSourceFormat(html) === "source") { + const renderMode = options.renderMode ?? "default"; + const bookForm = options.bookForm?.toLowerCase() ?? 
""; + const useEnhancedProse = renderMode === "enhancedProse" && bookForm !== "play"; + const usePoemProse = renderMode === "poemProse"; + + if (usePoemProse) { + normalizedHtml = normalizeChapterHtmlPoemProse(html); + } else if (useEnhancedProse) { + normalizedHtml = normalizeChapterHtmlEnhanced(html); + } else { + normalizedHtml = normalizeChapterHtml(html); + } + } + + return countDataIndexFromHtml(normalizedHtml); +} + export interface CharacterOccurrence { slug: string; chapter: number; diff --git a/apps/player/src/utils/getChapterTitle.ts b/apps/player/src/utils/getChapterTitle.ts deleted file mode 100644 index f5bcfce9..00000000 --- a/apps/player/src/utils/getChapterTitle.ts +++ /dev/null @@ -1,107 +0,0 @@ -import { getBookData } from "@player/state/bookDataStore"; -import { isNumberTitle } from "./isNumberTitle"; - -export const getTitle = (chapter: number, t: (key: string) => string) => { - // Special case for 0 - if (chapter === 0) return t("chapter_zero"); - - // Units (1-9) - const units = [ - "", - t("ordinal.1"), - t("ordinal.2"), - t("ordinal.3"), - t("ordinal.4"), - t("ordinal.5"), - t("ordinal.6"), - t("ordinal.7"), - t("ordinal.8"), - t("ordinal.9"), - ]; - - // Teens (11-19) - const teens = [ - t("ordinal.10"), - t("ordinal.11"), - t("ordinal.12"), - t("ordinal.13"), - t("ordinal.14"), - t("ordinal.15"), - t("ordinal.16"), - t("ordinal.17"), - t("ordinal.18"), - t("ordinal.19"), - ]; - - // Tens (10, 20, 30, etc.) - const tens = [ - "", - t("ordinal.10"), - t("ordinal.20"), - t("ordinal.30"), - t("ordinal.40"), - t("ordinal.50"), - t("ordinal.60"), - t("ordinal.70"), - t("ordinal.80"), - t("ordinal.90"), - ]; - - // Hundreds (100, 200, etc.) - in case they're needed for very large books - const hundreds = [ - "", - t("ordinal.100"), - t("ordinal.200"), - t("ordinal.300"), - t("ordinal.400"), - t("ordinal.500"), - t("ordinal.600"), - t("ordinal.700"), - t("ordinal.800"), - t("ordinal.900"), - ]; - - let chapterName = ""; - - if (chapter >= 100) { - const hundred = Math.floor(chapter / 100); - chapterName += hundreds[hundred] + " "; - chapter %= 100; - } - - if (chapter >= 20) { - const ten = Math.floor(chapter / 10); - const unit = chapter % 10; - chapterName += tens[ten]; - if (unit > 0) { - chapterName += " " + units[unit]; - } - } else if (chapter >= 10) { - chapterName += teens[chapter - 10]; - } else { - chapterName += units[chapter]; - } - - return `${t("chapter")} ${chapterName.trim()}`; -}; - -/** - * Get the display title for a chapter, preferring custom titles over ordinal numbers - * @param chapterNumber - The chapter number (1-based) - * @param t - Translation function - * @returns The chapter title to display - */ -export const getChapterTitle = (chapterNumber: number, t: (key: string) => string): string => { - const bookData = getBookData(); - - if (!bookData?.chapters) { - return `${t("chapter")} ${chapterNumber}`; - } - - const chapter = bookData.chapters.find((ch) => parseInt(ch.id) === chapterNumber); - if (chapter && chapter.title.trim() && !isNumberTitle(chapter.title)) { - return chapter.title; - } - - return getTitle(chapterNumber, t); -}; diff --git a/bun.lock b/bun.lock index 7b8424b2..b0cbbb92 100644 --- a/bun.lock +++ b/bun.lock @@ -153,6 +153,7 @@ "version": "1.0.0", "dependencies": { "@ai-sdk/anthropic": "2.0.38", + "@ai-sdk/azure": "2.0.91", "@ai-sdk/cerebras": "^1.0.20", "@ai-sdk/google": "^2.0.14", "@ai-sdk/groq": "^2.0.21", @@ -625,6 +626,8 @@ "@ai-sdk/anthropic": ["@ai-sdk/anthropic@2.0.38", "", { "dependencies": { "@ai-sdk/provider": 
"2.0.0", "@ai-sdk/provider-utils": "3.0.13" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-NjU1ftHbu90OfRCgBwfFelmdEXwGFwLEcfyOyyfjRDm8QHaJUlPNnXhdhPTYuUU386yhj29Vibemiaq6jQv3lA=="], + "@ai-sdk/azure": ["@ai-sdk/azure@2.0.91", "", { "dependencies": { "@ai-sdk/openai": "2.0.89", "@ai-sdk/provider": "2.0.1", "@ai-sdk/provider-utils": "3.0.20" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-9tznVSs6LGQNKKxb8pKd7CkBV9yk+a/ENpFicHCj2CmBUKefxzwJ9JbUqrlK3VF6dGZw3LXq0dWxt7/Yekaj1w=="], + "@ai-sdk/cerebras": ["@ai-sdk/cerebras@1.0.34", "", { "dependencies": { "@ai-sdk/openai-compatible": "1.0.30", "@ai-sdk/provider": "2.0.1", "@ai-sdk/provider-utils": "3.0.20" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-XOK0dJsAGoPYi/lfR4KFBi8xhvJ46oCpAxUD6FmJAuJ4eh0qlj5zDt+myvzM8gvN7S6K7zHD+mdWlOPKGQT8Vg=="], "@ai-sdk/gateway": ["@ai-sdk/gateway@2.0.2", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.13", "@vercel/oidc": "3.0.3" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-25F1qPqZxOw9IcV9OQCL29hV4HAFLw5bFWlzQLBi5aDhEZsTMT2rMi3umSqNaUxrrw+dLRtjOL7RbHC+WjbA/A=="], @@ -5225,6 +5228,8 @@ "@ai-sdk/anthropic/@ai-sdk/provider": ["@ai-sdk/provider@2.0.0", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="], + "@ai-sdk/azure/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.20", "", { "dependencies": { "@ai-sdk/provider": "2.0.1", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-iXHVe0apM2zUEzauqJwqmpC37A5rihrStAih5Ks+JE32iTe4LZ58y17UGBjpQQTCRw9YxMeo2UFLxLpBluyvLQ=="], + "@ai-sdk/cerebras/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.20", "", { "dependencies": { "@ai-sdk/provider": "2.0.1", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-iXHVe0apM2zUEzauqJwqmpC37A5rihrStAih5Ks+JE32iTe4LZ58y17UGBjpQQTCRw9YxMeo2UFLxLpBluyvLQ=="], "@ai-sdk/gateway/@ai-sdk/provider": ["@ai-sdk/provider@2.0.0", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="], From 03d4d5c3c5d7bd1c08046f50c0cf6e12ef268608 Mon Sep 17 00:00:00 2001 From: Lukasz Gandecki Date: Thu, 5 Feb 2026 10:57:59 +0100 Subject: [PATCH 2/4] adding the wrapping/restoring unwrapped --- ...nwrapped-paragraphs-in-temporary-output.ts | 4 +- .../ensure-section-wrapper.spec.ts | 19 ++++ .../new-tooling/ensure-section-wrapper.ts | 9 ++ .../restore-unwrapped-lines.spec.ts | 36 +++++++ .../new-tooling/restore-unwrapped-lines.ts | 93 +++++++++++++++++++ 5 files changed, 159 insertions(+), 2 deletions(-) create mode 100644 apps/pipeline/src/tools/new-tooling/ensure-section-wrapper.spec.ts create mode 100644 apps/pipeline/src/tools/new-tooling/ensure-section-wrapper.ts create mode 100644 apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.spec.ts create mode 100644 apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.ts diff --git a/apps/pipeline/src/scripts/fix-unwrapped-paragraphs-in-temporary-output.ts b/apps/pipeline/src/scripts/fix-unwrapped-paragraphs-in-temporary-output.ts index fe85f39a..5e7ddca2 100644 --- a/apps/pipeline/src/scripts/fix-unwrapped-paragraphs-in-temporary-output.ts +++ b/apps/pipeline/src/scripts/fix-unwrapped-paragraphs-in-temporary-output.ts @@ -1,7 +1,7 @@ 
#!/usr/bin/env bun import fs from "fs"; import path from "path"; -import { restoreUnwrappedBlocks } from "../tools/new-tooling/restore-unwrapped-blocks"; +import { restoreUnwrappedLines } from "../tools/new-tooling/restore-unwrapped-lines"; import { buildSectionWrapper, extractSectionInner } from "../tools/new-tooling/section-wrapper"; const DEFAULT_OUTPUT_ROOT = @@ -110,7 +110,7 @@ function main() { const modelExtract = extractSectionInner(modelRaw); const originalExtract = extractSectionInner(originalRaw); - const fixedInner = restoreUnwrappedBlocks(originalExtract.inner, modelExtract.inner); + const fixedInner = restoreUnwrappedLines(originalExtract.inner, modelExtract.inner); const hasChanges = fixedInner !== modelExtract.inner; const output = hasChanges ? buildSectionWrapper(fixedInner, modelExtract.wrapper) : modelRaw; diff --git a/apps/pipeline/src/tools/new-tooling/ensure-section-wrapper.spec.ts b/apps/pipeline/src/tools/new-tooling/ensure-section-wrapper.spec.ts new file mode 100644 index 00000000..dfaf82fa --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/ensure-section-wrapper.spec.ts @@ -0,0 +1,19 @@ +import { describe, expect, it } from "vitest"; +import { ensureSectionWrapper } from "./ensure-section-wrapper"; + +describe("ensureSectionWrapper", () => { + it("passes through valid section wrappers", () => { + const input = '

Hi

'; + expect(ensureSectionWrapper(input)).toBe(input); + }); + + it("accepts additional attributes", () => { + const input = '

Hi

'; + expect(ensureSectionWrapper(input)).toBe(input); + }); + + it("throws when section wrapper is missing", () => { + const input = "
<p>Hi</p>
"; + expect(() => ensureSectionWrapper(input)).toThrow("Missing <section>
wrapper"); + }); +}); diff --git a/apps/pipeline/src/tools/new-tooling/ensure-section-wrapper.ts b/apps/pipeline/src/tools/new-tooling/ensure-section-wrapper.ts new file mode 100644 index 00000000..ae943345 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/ensure-section-wrapper.ts @@ -0,0 +1,9 @@ +export function ensureSectionWrapper(html: string): string { + const match = html.match( + /^\s*<section[^>]*data-chapter\s*=\s*['"]?\d+['"]?[^>]*>[\s\S]*<\/section>\s*$/i, + ); + if (!match) { + throw new Error("Missing <section>
wrapper"); + } + return html; +} diff --git a/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.spec.ts b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.spec.ts new file mode 100644 index 00000000..e8326e41 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.spec.ts @@ -0,0 +1,36 @@ +import { describe, expect, it } from "vitest"; +import { restoreUnwrappedLines } from "./restore-unwrapped-lines"; + +describe("restoreUnwrappedLines", () => { + it("wraps a bare text line using the original
<p>
wrapper", () => { + const original = [ + "

One.

", + '

Two Evelyn.

', + "

Miss Howard nodded grimly.

", + ].join("\n"); + + const model = [ + "

One.

", + '

Two Evelyn.

', + "Miss Howard nodded grimly.", + ].join("\n"); + + const expected = [ + "

One.

", + '

Two Evelyn.

', + "

Miss Howard nodded grimly.

", + ].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); + + it("preserves indentation when wrapping", () => { + const original = ["

Intro.

", "

Indented line.

"].join("\n"); + + const model = ["

Intro.

", " Indented line."].join("\n"); + + const expected = ["

Intro.

", "

Indented line.

"].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); +}); diff --git a/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.ts b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.ts new file mode 100644 index 00000000..30a94a1d --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.ts @@ -0,0 +1,93 @@ +import { DOMParser } from "@xmldom/xmldom"; + +type OriginalBlock = { openTag: string; closeTag: string; normalizedText: string }; + +function normalizeText(text: string): string { + return text.replace(/\s+/g, " ").trim(); +} + +function getTextContent(html: string, parser: DOMParser): string { + const doc = parser.parseFromString(`

<p>${html}</p>

`, "text/html"); + const p = doc.getElementsByTagName("p")[0]; + return p?.textContent ?? ""; +} + +function extractOriginalBlocks(originalInner: string): OriginalBlock[] { + const parser = new DOMParser(); + const blocks: OriginalBlock[] = []; + const lines = originalInner.split(/\r?\n/); + + for (const line of lines) { + const trimmed = line.trim(); + if (!trimmed.startsWith("<p")) continue; + + const openEnd = trimmed.indexOf(">"); + const closeStart = trimmed.lastIndexOf("
</p>
"); + if (openEnd === -1 || closeStart === -1 || closeStart <= openEnd) continue; + + const openTag = trimmed.slice(0, openEnd + 1); + const innerHtml = trimmed.slice(openEnd + 1, closeStart); + const normalizedText = normalizeText(getTextContent(innerHtml, parser)); + + blocks.push({ openTag, closeTag: "
</p>
", normalizedText }); + } + + return blocks; +} + +function findMatchingIndex( + blocks: OriginalBlock[], + startIndex: number, + normalizedText: string, +): number { + if (!normalizedText) return -1; + for (let i = startIndex; i < blocks.length; i += 1) { + if (blocks[i].normalizedText === normalizedText) return i; + } + return -1; +} + +export function restoreUnwrappedLines(originalInner: string, modelInner: string): string { + if (originalInner === modelInner) return modelInner; + + const blocks = extractOriginalBlocks(originalInner); + if (blocks.length === 0) return modelInner; + + const lines = modelInner.split(/\r?\n/); + const output: string[] = []; + let changed = false; + let originalIndex = 0; + + for (const line of lines) { + const trimmed = line.trim(); + + if (!trimmed) { + output.push(line); + continue; + } + + if (trimmed.startsWith("<")) { + output.push(line); + continue; + } + + const normalizedText = normalizeText(trimmed); + const matchIndex = findMatchingIndex(blocks, originalIndex, normalizedText); + + if (matchIndex >= 0) { + const indentMatch = line.match(/^\s*/); + const indent = indentMatch ? indentMatch[0] : ""; + const block = blocks[matchIndex]; + output.push(`${indent}${block.openTag}${trimmed}${block.closeTag}`); + originalIndex = matchIndex + 1; + changed = true; + continue; + } + + output.push(line); + } + + const joined = output.join("\n"); + if (!changed) return modelInner; + return modelInner.endsWith("\n") ? `${joined}\n` : joined; +} From 096e5cd9064908f3bc2f3821b8557f616548f0ba Mon Sep 17 00:00:00 2001 From: Lukasz Gandecki Date: Thu, 5 Feb 2026 14:09:04 +0100 Subject: [PATCH 3/4] pipeline: free-run mode, raw LLM capture, generic avatar fallback, SE search --- .../src/pages/StandardEbooksPage.tsx | 89 ++++++++++++++++--- apps/pipeline/src/server/pipeline.ts | 42 ++++++++- apps/pipeline/src/server/router.ts | 12 +++ apps/pipeline/src/server/style-selection.ts | 5 +- .../identifyEntityAndRewriteParagraphs.ts | 36 ++++++-- .../new-tooling/generate-flux-schnel-image.ts | 2 +- .../generate-pictures-for-entities.ts | 20 ++++- .../generate-prompts-for-backgrounds.ts | 5 ++ .../get-chapter-by-chapter-summary.ts | 2 +- ...by-chapter-with-paragraphs-json-summary.ts | 2 +- .../sanitize-nested-paragraphs.spec.ts | 24 +++++ .../new-tooling/sanitize-nested-paragraphs.ts | 50 +++++++++++ apps/player/src/context/BookConvexContext.tsx | 1 + apps/player/src/ui/activateMediaInRange.ts | 40 ++++++++- convex/lib/characterDataV2.ts | 10 +++ 15 files changed, 310 insertions(+), 30 deletions(-) create mode 100644 apps/pipeline/src/tools/new-tooling/sanitize-nested-paragraphs.spec.ts create mode 100644 apps/pipeline/src/tools/new-tooling/sanitize-nested-paragraphs.ts diff --git a/apps/pipeline-ui/src/pages/StandardEbooksPage.tsx b/apps/pipeline-ui/src/pages/StandardEbooksPage.tsx index 6717d5df..e9b8f891 100644 --- a/apps/pipeline-ui/src/pages/StandardEbooksPage.tsx +++ b/apps/pipeline-ui/src/pages/StandardEbooksPage.tsx @@ -1,7 +1,7 @@ import { useState, useEffect, useCallback, useRef, useLayoutEffect } from "react"; import { useNavigate } from "react-router-dom"; import { trpc } from "../trpc"; -import { Loader2, ArrowLeft, BookOpen } from "lucide-react"; +import { Loader2, ArrowLeft, BookOpen, Search, X } from "lucide-react"; import { BookCard, type CollectionBook } from "../components/BookCard"; import { BookModal } from "../components/BookModal"; import { Button } from "@/components/ui/button"; @@ -136,15 +136,18 @@ function AuthorLetterRow({ export function 
StandardEbooksPage() { const navigate = useNavigate(); const [groupedBooks, setGroupedBooks] = useState>({}); + const [allBooks, setAllBooks] = useState([]); const [isLoading, setIsLoading] = useState(true); const [totalBooks, setTotalBooks] = useState(0); const [modalBook, setModalBook] = useState(null); + const [searchQuery, setSearchQuery] = useState(""); useEffect(() => { const loadData = async () => { try { const data = await trpc.getStandardEbooksIndex.query(); setGroupedBooks(data.groupedByAuthorLetter); + setAllBooks(data.books); setTotalBooks(data.books.length); } catch (e) { console.error("Failed to load Standard Ebooks index:", e); @@ -178,7 +181,33 @@ export function StandardEbooksPage() { ); } - const sortedLetters = Object.keys(groupedBooks).sort(); + const normalizedQuery = searchQuery.trim().toLowerCase(); + const filteredBooks = normalizedQuery + ? allBooks.filter((book) => { + const haystack = [ + book.title, + book.author, + book.authorFileAs, + book.description, + book.subjects.join(" "), + ] + .join(" ") + .toLowerCase(); + return haystack.includes(normalizedQuery); + }) + : allBooks; + + const visibleGroupedBooks = normalizedQuery + ? filteredBooks.reduce>((acc, book) => { + const firstLetter = (book.authorFileAs || book.author).charAt(0).toUpperCase(); + if (!acc[firstLetter]) acc[firstLetter] = []; + acc[firstLetter].push(book); + return acc; + }, {}) + : groupedBooks; + + const sortedLetters = Object.keys(visibleGroupedBooks).sort(); + const visibleCount = normalizedQuery ? filteredBooks.length : totalBooks; return (
@@ -189,7 +218,7 @@ export function StandardEbooksPage() { Standard Ebooks

- {totalBooks} professionally formatted public domain books + {visibleCount} professionally formatted public domain books

-
- {sortedLetters.map((letter, idx) => ( -
- +
+
+ + setSearchQuery(e.target.value)} + placeholder="Search by title, author, or subject..." + className="w-full pl-10 pr-10 py-2 rounded-md bg-background border border-border text-foreground placeholder:text-muted-foreground focus:outline-none focus:ring-2 focus:ring-primary/50" /> + {searchQuery.trim() && ( + + )}
- ))} +
+ {normalizedQuery + ? `Showing ${visibleCount} of ${totalBooks} books` + : `${totalBooks} books`} +
+
+
+ +
+ {sortedLetters.length === 0 ? ( +
+ No books match your search. +
+ ) : ( + sortedLetters.map((letter, idx) => ( +
+ +
+ )) + )}
diff --git a/apps/pipeline/src/server/pipeline.ts b/apps/pipeline/src/server/pipeline.ts index ed1a7cbc..f3ed0054 100644 --- a/apps/pipeline/src/server/pipeline.ts +++ b/apps/pipeline/src/server/pipeline.ts @@ -258,8 +258,16 @@ async function uploadCharactersToConvex( } } + console.log( + "[uploadCharactersToConvex] Characters to process:", + referenceCards.characters.map((c) => c.name), + ); + for (const character of referenceCards.characters) { const characterSlug = generateTagName(character.name).toLowerCase(); + console.log( + `[uploadCharactersToConvex] Processing character: "${character.name}" -> slug: "${characterSlug}"`, + ); const promptEntry = generatedPrompts.characters.find( (p) => generateTagName(p.name).toLowerCase() === characterSlug, ); @@ -276,7 +284,11 @@ async function uploadCharactersToConvex( const avatarExtensions = [".png", ".jpg", ".jpeg", ".webp"]; for (const ext of avatarExtensions) { const avatarPath = path.join(outputDir, "characters", `${characterSlug}${ext}`); - if (fs.existsSync(avatarPath)) { + const fileExists = fs.existsSync(avatarPath); + console.log( + `[uploadCharactersToConvex] Checking avatar: ${avatarPath} - exists: ${fileExists}`, + ); + if (fileExists) { addLog(job, `Uploading avatar for ${character.name}...`); try { const content = fs.readFileSync(avatarPath); @@ -651,6 +663,28 @@ export async function startPipeline(input: { } else { autoStyle = await createGraphicalStyle(slug, { saveToFile: false }); } + + const isFreeRun = process.env.FREE_RUN === "true"; + if (isFreeRun) { + const FREE_RUN_AVATAR_STYLE = + "Abstract geometric avatar Bauhaus style, simple shapes, limited color palette. Natural look, flat shade."; + const forcedStyle = { ...autoStyle, avatarStyle: FREE_RUN_AVATAR_STYLE }; + + setAutoStyleComplete(bookRoot, forcedStyle); + setStyleChoice(bookRoot, "auto"); + + writeBookFile( + "graphicalStyle.json", + JSON.stringify(forcedStyle, null, 2), + FILE_TYPE.TEMPORARY, + ); + addLog(job, "FREE_RUN enabled - skipping style selection and previews"); + + styleSelectionCallbacks.delete(job.id); + await uploadGraphicalStyleToConvex(job, tempOutputDir); + return; + } + setAutoStyleComplete(bookRoot, autoStyle); addLog(job, "Auto style generated, awaiting user input"); @@ -739,6 +773,10 @@ export async function startPipeline(input: { }, generate_backgrounds: async () => { + if (process.env.FREE_RUN === "true") { + addLog(job, "FREE_RUN enabled - skipping background generation and upload"); + return; + } setBookArg(slug); await generateBackgrounds({}); await uploadBackgroundsToConvex(job, outputDir); @@ -830,7 +868,7 @@ export async function startPipeline(input: { (step) => step !== "complete" && step !== "failed", ); - if (process.env.QUICK_MODE === "true") { + if (process.env.QUICK_MODE === "true" || process.env.FREE_RUN === "true") { const skipSteps: Step[] = [ "make_chapter_summaries", "map_summaries_to_paragraphs", diff --git a/apps/pipeline/src/server/router.ts b/apps/pipeline/src/server/router.ts index cbbc036b..2d75b0e8 100644 --- a/apps/pipeline/src/server/router.ts +++ b/apps/pipeline/src/server/router.ts @@ -603,6 +603,10 @@ export const appRouter = router({ submitStyleDescription: procedure .input(z.object({ jobId: z.string(), description: z.string().nullable() })) .mutation(async ({ input }) => { + if (process.env.FREE_RUN === "true") { + throw new Error("Style selection disabled when FREE_RUN=true"); + } + const job = jobs.get(input.jobId); if (!job) throw new Error("Job not found"); @@ -647,6 +651,10 @@ export const appRouter = 
router({ chooseStyle: procedure .input(z.object({ jobId: z.string(), choice: z.enum(["auto", "user"]) })) .mutation(async ({ input }) => { + if (process.env.FREE_RUN === "true") { + throw new Error("Style selection disabled when FREE_RUN=true"); + } + const job = jobs.get(input.jobId); if (!job) throw new Error("Job not found"); @@ -677,6 +685,10 @@ export const appRouter = router({ .input(z.object({ jobId: z.string() })) // eslint-disable-next-line complexity -- multiple file operations and style processing steps .mutation(async ({ input }) => { + if (process.env.FREE_RUN === "true") { + throw new Error("Style previews disabled when FREE_RUN=true"); + } + const job = jobs.get(input.jobId); if (!job) throw new Error("Job not found"); diff --git a/apps/pipeline/src/server/style-selection.ts b/apps/pipeline/src/server/style-selection.ts index 3bfdf9a8..0165fcdb 100644 --- a/apps/pipeline/src/server/style-selection.ts +++ b/apps/pipeline/src/server/style-selection.ts @@ -49,13 +49,14 @@ function getStyleSelectionPath(bookRoot: string): string { export function initStyleSelection(bookRoot: string): StyleSelectionState { const now = Date.now(); + const isFreeRun = process.env.FREE_RUN === "true"; const state: StyleSelectionState = { - status: "generating_auto_style", + status: isFreeRun ? "complete" : "generating_auto_style", autoStyle: null, userPrompt: null, userStyle: null, previews: null, - selected: null, + selected: isFreeRun ? "auto" : null, timeoutAt: null, startedAt: now, updatedAt: now, diff --git a/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts b/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts index 072f3457..76afe885 100644 --- a/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts +++ b/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts @@ -7,7 +7,8 @@ import { logger } from "../logger"; import fs from "fs"; import { compareXmlTextContent } from "./new-tooling/compare-chapters-xml"; import { restoreOriginalTextInHtml } from "./new-tooling/restore-text-in-html"; -import { restoreUnwrappedBlocks } from "./new-tooling/restore-unwrapped-blocks"; +import { restoreUnwrappedLines } from "./new-tooling/restore-unwrapped-lines"; +import { sanitizeNestedParagraphs } from "./new-tooling/sanitize-nested-paragraphs"; import path from "path"; import { type NewReferenceCardsResponse } from "../types"; import { writeBookFile } from "../helpers/writeBookFile"; @@ -123,7 +124,14 @@ async function processChunk( ); writeBookFile(`compiled-prompt-for-chapter-${chapter}-chunk-${chunkIndex}.md`, compiledPrompt); - const llmProviders = [callGeminiWrapper, callGrok, callClaude, callGpt5]; + const llmProviders = [ + callGeminiWrapper, + callGeminiWrapper, + callGeminiWrapper, + callGrok, + callClaude, + callGpt5, + ]; try { const selectedProvider = llmProviders[attempt % llmProviders.length]; @@ -141,6 +149,10 @@ async function processChunk( logger.info(`Response for chapter ${chapter} chunk ${chunkIndex}:`, response.slice(0, 50)); const clearedResponse = response.replace(/```xml\n/, "").replace(/\n```$/, ""); + writeBookFile( + `rewritten-paragraphs-for-chapter-${chapter}-chunk-${chunkIndex}-${selectedProvider.name}.raw.xml`, + clearedResponse, + ); let restored = clearedResponse; try { @@ -150,10 +162,16 @@ async function processChunk( } try { - restored = restoreUnwrappedBlocks(originalChunkXml, restored); + restored = restoreUnwrappedLines(originalChunkXml, restored); + } catch (e) { + logger.error(`Error restoring unwrapped lines for chapter ${chapter} 
chunk ${chunkIndex}`, e); + } + + try { + restored = sanitizeNestedParagraphs(restored); } catch (e) { logger.error( - `Error restoring unwrapped blocks for chapter ${chapter} chunk ${chunkIndex}`, + `Error sanitizing nested paragraphs for chapter ${chapter} chunk ${chunkIndex}`, e, ); } @@ -302,9 +320,15 @@ export const identifyAndRewriteParagraphs = async ( } try { - restored = restoreUnwrappedBlocks(paragraphsForPage, restored); + restored = restoreUnwrappedLines(paragraphsForPage, restored); + } catch (e) { + logger.error("Error restoring unwrapped lines for chapter " + chapter, e); + } + + try { + restored = sanitizeNestedParagraphs(restored); } catch (e) { - logger.error("Error restoring unwrapped blocks for chapter " + chapter, e); + logger.error("Error sanitizing nested paragraphs for chapter " + chapter, e); } if (restored && compareXmlTextContent(paragraphsForPage, restored)) { diff --git a/apps/pipeline/src/tools/new-tooling/generate-flux-schnel-image.ts b/apps/pipeline/src/tools/new-tooling/generate-flux-schnel-image.ts index 36842ad4..5acd7c02 100644 --- a/apps/pipeline/src/tools/new-tooling/generate-flux-schnel-image.ts +++ b/apps/pipeline/src/tools/new-tooling/generate-flux-schnel-image.ts @@ -8,7 +8,7 @@ import { sanitizePromptForModeration, generateAbstractPortraitPrompt, } from "./generate-pictures-for-entities"; -import { logError } from "src/helpers/logError"; +import { logError } from "../../helpers/logError"; const replicate = new Replicate({ auth: process.env.REPLICATE_API_TOKEN }); diff --git a/apps/pipeline/src/tools/new-tooling/generate-pictures-for-entities.ts b/apps/pipeline/src/tools/new-tooling/generate-pictures-for-entities.ts index 3e41542b..ad76f011 100644 --- a/apps/pipeline/src/tools/new-tooling/generate-pictures-for-entities.ts +++ b/apps/pipeline/src/tools/new-tooling/generate-pictures-for-entities.ts @@ -280,7 +280,10 @@ ${chapters .join("\n")} `; - const charactersXml = characterNames.map((name) => ``).join("\n"); + const charactersXml = characterNames + .filter((name) => name !== "generic-avatar") + .map((name) => ``) + .join("\n"); prompt = initialPrompt .replace("{{characters}}", `${charactersXml}`) .replace("{{bookText}}", bookText); @@ -288,6 +291,11 @@ ${chapters const response = await callGeminiWithThinkingAndSchemaAndParsed(prompt, CharactersSchema); logger.info(`Response: `, response); + response.characters.push({ + name: "generic-avatar", + visualGuide: + "A mysterious figure shown from behind or in silhouette. No distinct facial features visible. Anonymous, sexless, suitable for representing any unnamed character. 
Atmospheric lighting with the figure partially obscured by shadow or mist.", + }); return response; }; @@ -300,7 +308,15 @@ export const generatePicturesForEntities = async ( let generatedPrompts: CharactersType; if (bookFileExists("generated-prompts.json", FILE_TYPE.TEMPORARY)) { generatedPrompts = JSON.parse(readBookFile("generated-prompts.json", FILE_TYPE.TEMPORARY)); - console.log("inside generated prompts"); + console.log("[generatePicturesForEntities] Using cached generated-prompts.json"); + console.log( + "[generatePicturesForEntities] Cached characters:", + generatedPrompts.characters.map((c) => c.name), + ); + console.log( + "[generatePicturesForEntities] Reference cards characters:", + referenceCards.characters.map((c) => c.name), + ); } else { generatedPrompts = await generatePicturePrompts(referenceCards, { skipBookAnalysis }); writeBookFile( diff --git a/apps/pipeline/src/tools/new-tooling/generate-prompts-for-backgrounds.ts b/apps/pipeline/src/tools/new-tooling/generate-prompts-for-backgrounds.ts index c132a1ae..8d1a6070 100644 --- a/apps/pipeline/src/tools/new-tooling/generate-prompts-for-backgrounds.ts +++ b/apps/pipeline/src/tools/new-tooling/generate-prompts-for-backgrounds.ts @@ -104,6 +104,11 @@ export type GenerateBackgroundsOptions = { }; export const generateBackgrounds = async (options: GenerateBackgroundsOptions = {}) => { + if (FREE_RUN) { + logger.info("FREE_RUN enabled - skipping background generation."); + return; + } + const { customStyle, chapterNumbers, diff --git a/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-summary.ts b/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-summary.ts index e5453e3a..814c9dd2 100644 --- a/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-summary.ts +++ b/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-summary.ts @@ -5,7 +5,7 @@ import { writeBookFile } from "../../helpers/writeBookFile"; import { readBookFile } from "../../helpers/readBookFile"; import { FILE_TYPE } from "../../helpers/filesHelpers"; import { logger } from "../../logger"; -import { callGrokAzure } from "src/callGrokAzure"; +import { callGrokAzure } from "../../callGrokAzure"; export const makeRollingChapterSummaries = async () => { const bookSettings = getBookSettings(); diff --git a/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-with-paragraphs-json-summary.ts b/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-with-paragraphs-json-summary.ts index cac1c095..783b5afe 100644 --- a/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-with-paragraphs-json-summary.ts +++ b/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-with-paragraphs-json-summary.ts @@ -7,7 +7,7 @@ import { readBookFile } from "../../helpers/readBookFile"; import { FILE_TYPE } from "../../helpers/filesHelpers"; import { writeBookFile } from "../../helpers/writeBookFile"; import { callSlowGeminiWithThinkingAndSchemaAndParsed } from "../../callFastGemini"; -import { callGrokAzureWithSchema } from "src/callGrokAzure"; +import { callGrokAzureWithSchema } from "../../callGrokAzure"; import { buildParagraphsForSummary } from "./summaryParagraphs"; // Define the schema for reference cards response diff --git a/apps/pipeline/src/tools/new-tooling/sanitize-nested-paragraphs.spec.ts b/apps/pipeline/src/tools/new-tooling/sanitize-nested-paragraphs.spec.ts new file mode 100644 index 00000000..e1bbeb51 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/sanitize-nested-paragraphs.spec.ts @@ -0,0 +1,24 @@ +import { describe, 
expect, it } from "vitest";
+import { sanitizeNestedParagraphs } from "./sanitize-nested-paragraphs";
+
+describe("sanitizeNestedParagraphs", () => {
+  it("removes nested <p> tags inside a paragraph and keeps inner text", () => {
+    const input =
+      '<p>Alpha <p>\u201cOho!\u201d said the board.</p> Omega</p>';
+
+    const expected = "<p>Alpha \u201cOho!\u201d said the board. Omega</p>";
+
+    expect(sanitizeNestedParagraphs(input)).toBe(expected);
+  });
+
+  it("leaves normal paragraphs unchanged", () => {
+    const input = "<p>One.</p>\n<p>Two.</p>";
+    expect(sanitizeNestedParagraphs(input)).toBe(input);
+  });
+
+  it("handles multiple nested paragraphs in one block", () => {
+    const input = '<p>Start <p>A</p> mid <p>B</p> end</p>';
+    const expected = "<p>Start A mid B end</p>";
+    expect(sanitizeNestedParagraphs(input)).toBe(expected);
+  });
+});
diff --git a/apps/pipeline/src/tools/new-tooling/sanitize-nested-paragraphs.ts b/apps/pipeline/src/tools/new-tooling/sanitize-nested-paragraphs.ts
new file mode 100644
index 00000000..c6126436
--- /dev/null
+++ b/apps/pipeline/src/tools/new-tooling/sanitize-nested-paragraphs.ts
@@ -0,0 +1,50 @@
+export function sanitizeNestedParagraphs(html: string): string {
+  let depth = 0;
+  let out = "";
+  let i = 0;
+
+  while (i < html.length) {
+    const char = html[i];
+    if (char !== "<") {
+      out += char;
+      i += 1;
+      continue;
+    }
+
+    const closeIdx = html.indexOf(">", i);
+    if (closeIdx === -1) {
+      out += html.slice(i);
+      break;
+    }
+
+    const tag = html.slice(i, closeIdx + 1);
+    const lower = tag.toLowerCase();
+
+    if (lower.startsWith("<p")) {
+      if (depth >= 1) {
+        // Nested <p> -> drop tag, keep content inline
+      } else {
+        out += tag;
+      }
+      depth += 1;
+      i = closeIdx + 1;
+      continue;
+    }
+
+    if (lower.startsWith("</p")) {
+      if (depth > 1) {
+        // Closing nested </p>
-> drop tag + } else { + out += tag; + } + depth = Math.max(0, depth - 1); + i = closeIdx + 1; + continue; + } + + out += tag; + i = closeIdx + 1; + } + + return out; +} diff --git a/apps/player/src/context/BookConvexContext.tsx b/apps/player/src/context/BookConvexContext.tsx index 4ad1626f..ea1c24e0 100644 --- a/apps/player/src/context/BookConvexContext.tsx +++ b/apps/player/src/context/BookConvexContext.tsx @@ -362,6 +362,7 @@ export function BookConvexProvider({ bookPath, children }: BookConvexProviderPro const characters = useMemo(() => { if (!charactersQuery) return []; + console.log(`charactersQuery`, charactersQuery); return charactersQuery.map((c) => ({ path: c.path, slug: c.slug, diff --git a/apps/player/src/ui/activateMediaInRange.ts b/apps/player/src/ui/activateMediaInRange.ts index cb729042..2d3b0312 100644 --- a/apps/player/src/ui/activateMediaInRange.ts +++ b/apps/player/src/ui/activateMediaInRange.ts @@ -314,6 +314,7 @@ function populateInlineAvatarShell( characterData: CharacterData | undefined, location: { chapter: number; paragraph: number } | null, snapshotOverride?: CharacterSnapshot | null, + genericCharacter?: CharacterData, ): boolean { if (shell.querySelector("img")) { return false; @@ -325,9 +326,38 @@ function populateInlineAvatarShell( return false; } + // Handle unknown characters (not in Convex) if (!characterData) { - console.warn(`[populateInlineAvatarShell] ${characterSlug}: no characterData provided`); - return false; + console.log("generic", genericCharacter); + // Try generic avatar first, then SVG fallback + const genericAvatarUrl = genericCharacter?.media?.avatarUrl; + console.log(`[populateInlineAvatarShell] Unknown character "${characterSlug}":`, { + hasGenericCharacter: !!genericCharacter, + genericCharacterSlug: genericCharacter?.slug, + genericAvatarUrl, + genericMedia: genericCharacter?.media, + }); + const fallbackSrc = genericAvatarUrl ?? generateFallbackAvatarUrl(characterSlug); + + // Generate display name from slug: "other-board-members" -> "Other Board Members" + const displayName = characterSlug.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase()); + + shell.title = displayName; + + const placeholderImg = document.createElement("img"); + placeholderImg.src = normalizeSrcForInlineAvatar(fallbackSrc); + placeholderImg.classList.add( + "absolute", + "top-0", + "left-0", + "w-full", + "h-full", + "object-cover", + "rounded-full", + ); + placeholderImg.alt = displayName; + shell.appendChild(placeholderImg); + return true; } const snapshot = @@ -462,6 +492,7 @@ export function activateMediaInRange( ) { const charactersData = getCharactersData(); const charactersBySlug = new Map(charactersData.map((c) => [c.slug, c])); + const genericCharacter = charactersBySlug.get("generic-avatar"); if (isPlayFormat && !isMobile()) { const activeParagraph = document.querySelector(`.active-paragraph`); @@ -570,6 +601,7 @@ export function activateMediaInRange( characterData, locationForPlaceholder, snapshot, + genericCharacter, ); } } else { @@ -680,7 +712,9 @@ export const openPlayRowCharacterModal = ( }; export function hydrateInlineAvatarsInSection(section: HTMLElement): void { + console.log("getCharactersData", getCharactersData()); const charactersBySlug = new Map(getCharactersData().map((c) => [c.slug, c])); + const genericCharacter = charactersBySlug.get("generic-avatar"); const chapterAttr = section.dataset.chapter; const chapterNumber = chapterAttr ? 
parseInt(chapterAttr, 10) : 0; @@ -694,7 +728,7 @@ export function hydrateInlineAvatarsInSection(section: HTMLElement): void { const paragraphIndex = paragraphEl?.dataset.index ? parseInt(paragraphEl.dataset.index, 10) : 0; const location = { chapter: chapterNumber, paragraph: paragraphIndex }; - populateInlineAvatarShell(shell, characterData, location); + populateInlineAvatarShell(shell, characterData, location, undefined, genericCharacter); }); const personaCells = section.querySelectorAll( diff --git a/convex/lib/characterDataV2.ts b/convex/lib/characterDataV2.ts index d1a67fea..64f41778 100644 --- a/convex/lib/characterDataV2.ts +++ b/convex/lib/characterDataV2.ts @@ -342,6 +342,16 @@ export function mergeV2ToCharacterData( avatar: o.avatar, })), }); + } else { + if (slug === "generic-avatar") { + result.push({ + slug, + characterName: meta.name, + bookSlug, + infoPerChapter: [], + media: meta.media, + }); + } } } From 1bf9a2ade1f74e8d49bd902d6bedde63ed229506 Mon Sep 17 00:00:00 2001 From: Lukasz Gandecki Date: Thu, 5 Feb 2026 16:01:59 +0100 Subject: [PATCH 4/4] zooming on images, html edit in convex, pipeline improvements --- apps/pipeline/AGENTS.md | 1 + apps/pipeline/src/callClaude.ts | 6 +- apps/pipeline/src/callFastGemini.ts | 47 +--- apps/pipeline/src/callO3.ts | 2 +- .../src/server/continue-pipeline-cli.ts | 9 +- .../pipeline/src/server/parallel-scheduler.ts | 1 + apps/pipeline/src/server/pipeline-progress.ts | 1 + apps/pipeline/src/server/pipeline.ts | 67 ++++- .../pipeline/src/server/upload-figures-cli.ts | 107 +++++++ apps/pipeline/src/shared/pipelineTypes.ts | 2 + apps/pipeline/src/tools/fixLongXml.ts | 2 +- .../wrapChaptersWithSections.ts | 2 +- .../identifyEntityAndRewriteParagraphs.ts | 5 +- .../restore-unwrapped-lines.spec.ts | 76 +++++ .../new-tooling/restore-unwrapped-lines.ts | 19 +- .../src/components/modals/ImageZoomModal.tsx | 266 ++++++++++++++++++ apps/player/src/features/ModalRenderers.tsx | 2 + .../imageZoom/ImageZoomModalRenderer.tsx | 20 ++ apps/player/src/hooks/useBookContent.ts | 19 +- .../src/stores/modals/imageModal.store.ts | 44 +++ apps/player/src/styles/se-semantics.css | 1 + convex/characterPromptGeneration.ts | 2 +- convex/generator.ts | 2 + convex/paragraphEditor.ts | 8 +- 24 files changed, 656 insertions(+), 55 deletions(-) create mode 100644 apps/pipeline/src/server/upload-figures-cli.ts create mode 100644 apps/player/src/components/modals/ImageZoomModal.tsx create mode 100644 apps/player/src/features/modals/imageZoom/ImageZoomModalRenderer.tsx create mode 100644 apps/player/src/stores/modals/imageModal.store.ts diff --git a/apps/pipeline/AGENTS.md b/apps/pipeline/AGENTS.md index 78a579d1..3cb77b26 100644 --- a/apps/pipeline/AGENTS.md +++ b/apps/pipeline/AGENTS.md @@ -24,6 +24,7 @@ bun src/continue-pipeline-cli.ts books-data/my-book --status | ----------------------------- | ------------------------- | -------------------------------- | | `import_epub` | EPUB → FB2 → rich.xml | `input/rich.xml` | | `create_settings` | Detect language, metadata | `bookSettings.json` | +| `upload_figures` | Upload SE figures | Convex `books/*/figures` | | `generate_reference_cards` | Character summaries | `single-summary-per-person.json` | | `rewrite_paragraphs` | Inject character tags | `rewritten-paragraphs-*.xml` | | `generate_graphical_style` | Visual style JSON | `graphicalStyle.json` | diff --git a/apps/pipeline/src/callClaude.ts b/apps/pipeline/src/callClaude.ts index ce96f32e..a0368aff 100644 --- a/apps/pipeline/src/callClaude.ts +++ 
b/apps/pipeline/src/callClaude.ts @@ -185,7 +185,11 @@ export const callClaude = async ( }; const doIt = async () => { - const result = await callClaude("Identify all named book characters (people) in this page.\n"); + const result = (await callGeminiWrapper( + "Identify all named book characters (people) in this page.\n", + undefined, + 1, + )) as string; logger.info(result); }; // Execute only if this file is being run directly (not imported) diff --git a/apps/pipeline/src/callFastGemini.ts b/apps/pipeline/src/callFastGemini.ts index 1a285004..361c89eb 100644 --- a/apps/pipeline/src/callFastGemini.ts +++ b/apps/pipeline/src/callFastGemini.ts @@ -76,44 +76,21 @@ Based on the book text answer the user's question, using quotes from the wider b }; export const callGeminiWithThinking = async (prompt: string) => { - const ai = new GoogleGenAI({ apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY }); - const config = { - responseMimeType: "text/plain", - httpOptions: { - timeout: 15 * 60 * 1000, // 15 minutes in milliseconds - }, - }; - const model = "gemini-3-flash-preview"; - // const model = "gemini-3-pro-preview"; - - const contents = [{ role: "user", parts: [{ text: prompt }] }]; const safetySettings = [ - { category: HarmCategory.HARM_CATEGORY_HARASSMENT, threshold: HarmBlockThreshold.BLOCK_NONE }, - { category: HarmCategory.HARM_CATEGORY_HATE_SPEECH, threshold: HarmBlockThreshold.BLOCK_NONE }, - { - category: HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, - threshold: HarmBlockThreshold.BLOCK_NONE, - }, - { - category: HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, - threshold: HarmBlockThreshold.BLOCK_NONE, - }, - { - category: HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY, - threshold: HarmBlockThreshold.BLOCK_NONE, - }, + { category: HarmCategory.HARM_CATEGORY_HARASSMENT, threshold: HarmBlockThreshold.OFF }, + { category: HarmCategory.HARM_CATEGORY_HATE_SPEECH, threshold: HarmBlockThreshold.OFF }, + { category: HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, threshold: HarmBlockThreshold.OFF }, + { category: HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, threshold: HarmBlockThreshold.OFF }, + { category: HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY, threshold: HarmBlockThreshold.OFF }, ]; - - console.log("before response", model); - const response = await ai.models.generateContent({ - model, - config: { ...config, safetySettings }, - contents, + console.log("CALLING GEMINI WITH THINKING"); + const { text } = await generateText({ + model: google("gemini-3-flash-preview"), + prompt, + experimental_telemetry: { isEnabled: true, recordInputs: true, recordOutputs: true }, + providerOptions: { google: { safetySettings } }, }); - - console.log("after response"); - - return response?.text; + return text; }; export const callGeminiWithThinkingAndSchema = async ( diff --git a/apps/pipeline/src/callO3.ts b/apps/pipeline/src/callO3.ts index a14782fc..36869adf 100644 --- a/apps/pipeline/src/callO3.ts +++ b/apps/pipeline/src/callO3.ts @@ -35,7 +35,7 @@ export const callGpt5 = async ( ) => { const chatCompletion = await client.chat.completions.create({ messages: [{ role: "user", content: prompt }], - model: "gpt-5.1", + model: "gpt-5.2", reasoning_effort: "medium", }); return chatCompletion.choices[0].message.content as string; diff --git a/apps/pipeline/src/server/continue-pipeline-cli.ts b/apps/pipeline/src/server/continue-pipeline-cli.ts index 1336fd9d..6dffa9b6 100644 --- a/apps/pipeline/src/server/continue-pipeline-cli.ts +++ b/apps/pipeline/src/server/continue-pipeline-cli.ts @@ -135,6 +135,7 @@ async 
function main() { // Determine which step to start from let fromStep: Step | undefined; + let completedSteps: Step[] | undefined; if (argv["from-step"]) { const specifiedStep = argv["from-step"] as string; @@ -147,6 +148,12 @@ async function main() { console.log(`Starting from specified step: ${fromStep} (${StepLabels[fromStep]})`); } else { // Auto-detect from progress file + const progress = readProgress(slug); + if (progress) { + completedSteps = Object.entries(progress.completedSteps) + .filter(([, value]) => value.status === "done") + .map(([step]) => step as Step); + } fromStep = getNextStep(slug) || undefined; if (fromStep) { console.log(`Auto-detected next step: ${fromStep} (${StepLabels[fromStep]})`); @@ -158,7 +165,7 @@ async function main() { console.log(`\nContinuing pipeline for slug="${slug}" from step: ${fromStep}`); - const job = await startPipeline({ slug, fromStep }); + const job = await startPipeline({ slug, fromStep, completedSteps }); // Poll job state and stream logs let lastLogIndex = 0; diff --git a/apps/pipeline/src/server/parallel-scheduler.ts b/apps/pipeline/src/server/parallel-scheduler.ts index dc73177e..068a0081 100644 --- a/apps/pipeline/src/server/parallel-scheduler.ts +++ b/apps/pipeline/src/server/parallel-scheduler.ts @@ -5,6 +5,7 @@ export type StepDependency = { step: Step; deps: Step[] }; export const STEP_DEPENDENCIES: StepDependency[] = [ { step: "import_epub", deps: [] }, { step: "create_settings", deps: ["import_epub"] }, + { step: "upload_figures", deps: ["create_settings"] }, { step: "generate_reference_cards", deps: ["create_settings"] }, { step: "rewrite_paragraphs", deps: ["generate_reference_cards"] }, { step: "generate_graphical_style", deps: ["create_settings"] }, diff --git a/apps/pipeline/src/server/pipeline-progress.ts b/apps/pipeline/src/server/pipeline-progress.ts index 3c05aa56..4e54bed0 100644 --- a/apps/pipeline/src/server/pipeline-progress.ts +++ b/apps/pipeline/src/server/pipeline-progress.ts @@ -22,6 +22,7 @@ export interface PipelineProgress { const STEP_ORDER: Step[] = [ "import_epub", "create_settings", + "upload_figures", "generate_reference_cards", "rewrite_paragraphs", "generate_graphical_style", diff --git a/apps/pipeline/src/server/pipeline.ts b/apps/pipeline/src/server/pipeline.ts index f3ed0054..d534ca66 100644 --- a/apps/pipeline/src/server/pipeline.ts +++ b/apps/pipeline/src/server/pipeline.ts @@ -132,6 +132,8 @@ function getContentType(filename: string): string { ".png": "image/png", ".jpg": "image/jpeg", ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".svg": "image/svg+xml", ".webp": "image/webp", ".mp4": "video/mp4", ".webm": "video/webm", @@ -471,14 +473,62 @@ async function uploadGraphicalStyleToConvex(job: Job, tempOutputDir: string) { } } +async function uploadFiguresToConvex(job: Job, repoRoot: string) { + const seBookDir = path.join(repoRoot, "standardebooks-data", "books", job.slug); + const metadataPath = path.join(seBookDir, "metadata.json"); + const imagesDir = path.join(seBookDir, "images"); + + if (!fs.existsSync(metadataPath) || !fs.existsSync(imagesDir)) { + addLog(job, "No Standard Ebooks images detected - skipping figure upload"); + return; + } + + const files = fs.readdirSync(imagesDir).filter((f) => /\.(png|jpe?g|gif|svg|webp)$/i.test(f)); + + if (files.length === 0) { + addLog(job, `No figure images found in ${imagesDir}`); + return; + } + + addLog(job, `Uploading ${files.length} Standard Ebooks figures to Convex...`); + + let uploaded = 0; + for (const file of files) { + const filePath = 
path.join(imagesDir, file); + try { + const content = fs.readFileSync(filePath); + await convex.uploadFile({ + folderPath: `${job.bookPath}/figures`, + basename: file, + content, + contentType: getContentType(file), + }); + uploaded++; + } catch (e) { + const msg = e instanceof Error ? e.message : String(e); + addLog(job, `⚠ Failed to upload figure ${file}: ${msg}`); + } + } + + addLog(job, `✔ Figures uploaded: ${uploaded}/${files.length}`); +} + export async function startPipeline(input: { epubPath?: string; fb2Path?: string; slug?: string; ebookConvertBin?: string; fromStep?: Step; + completedSteps?: Step[]; }) { - const { epubPath, fb2Path, slug: providedSlug, ebookConvertBin, fromStep } = input; + const { + epubPath, + fb2Path, + slug: providedSlug, + ebookConvertBin, + fromStep, + completedSteps, + } = input; const baseName = epubPath ? path.basename(epubPath, path.extname(epubPath)) : null; const slug = providedSlug || slugify(baseName || "book"); const bookPath = `books/${slug}`; @@ -487,6 +537,7 @@ export async function startPipeline(input: { const stepOrder = getStepOrder(); const fromStepIndex = fromStep ? getStepIndex(fromStep) : -1; + const completedStepSet = new Set(completedSteps ?? []); const job: Job = { id: uuidv4(), @@ -498,6 +549,9 @@ export async function startPipeline(input: { logs: [], steps: stepOrder.map((step) => { const stepIndex = getStepIndex(step); + if (completedStepSet.has(step) && step !== fromStep) { + return { step, status: "done" as const }; + } if (fromStepIndex > 0 && stepIndex < fromStepIndex) { return { step, status: "done" as const }; } @@ -517,6 +571,11 @@ export async function startPipeline(input: { const tempOutputDir = path.join(bookRoot, "temporary-output"); const schedulerState = createSchedulerState(); + for (const step of job.steps) { + if (step.status === "done") { + schedulerState.completedSteps.add(step.step); + } + } let referenceCards: NewReferenceCardsResponse; initStyleSelection(bookRoot); @@ -579,6 +638,10 @@ export async function startPipeline(input: { }); }, + upload_figures: async () => { + await uploadFiguresToConvex(job, repoRoot); + }, + generate_reference_cards: async () => { setBookArg(slug); const fileName = "single-summary-per-person.json"; @@ -876,8 +939,6 @@ export async function startPipeline(input: { "upload_answer_server_data", ]; for (const skip of skipSteps) { - const idx = stepsToRun.indexOf(skip); - if (idx !== -1) stepsToRun.splice(idx, 1); schedulerState.completedSteps.add(skip); const s = job.steps.find((x) => x.step === skip); if (s) s.status = "done"; diff --git a/apps/pipeline/src/server/upload-figures-cli.ts b/apps/pipeline/src/server/upload-figures-cli.ts new file mode 100644 index 00000000..dabc479b --- /dev/null +++ b/apps/pipeline/src/server/upload-figures-cli.ts @@ -0,0 +1,107 @@ +#!/usr/bin/env tsx +import path from "path"; +import fs from "fs"; +import dotenv from "dotenv"; +import { convex } from "./convex-client"; + +dotenv.config(); + +function getRepoRoot(): string { + return path.resolve(__dirname, "../../"); +} + +function getContentType(filename: string): string { + const ext = path.extname(filename).toLowerCase(); + const types: Record = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".svg": "image/svg+xml", + ".webp": "image/webp", + }; + return types[ext] || "application/octet-stream"; +} + +function usage() { + console.log("Usage: bun src/server/upload-figures-cli.ts [--source ]"); + console.log("Examples:"); + console.log( + " bun 
src/server/upload-figures-cli.ts agatha-christie_the-mysterious-affair-at-styles", + ); + console.log( + " bun src/server/upload-figures-cli.ts agatha-christie_the-mysterious-affair-at-styles-openai --source agatha-christie_the-mysterious-affair-at-styles", + ); +} + +async function main() { + const args = process.argv.slice(2); + const targetSlug = args[0]; + + if (!targetSlug) { + usage(); + process.exit(1); + } + + const sourceFlagIndex = args.findIndex((arg) => arg === "--source" || arg === "--from"); + const sourceSlug = + sourceFlagIndex !== -1 && args[sourceFlagIndex + 1] ? args[sourceFlagIndex + 1] : targetSlug; + + const repoRoot = getRepoRoot(); + const seBookDir = path.join(repoRoot, "standardebooks-data", "books", sourceSlug); + const metadataPath = path.join(seBookDir, "metadata.json"); + const imagesDir = path.join(seBookDir, "images"); + + if (!fs.existsSync(metadataPath)) { + console.error(`Standard Ebooks metadata not found: ${metadataPath}`); + process.exit(1); + } + + if (!fs.existsSync(imagesDir)) { + console.error(`Images directory not found: ${imagesDir}`); + process.exit(1); + } + + const files = fs.readdirSync(imagesDir).filter((f) => /\.(png|jpe?g|gif|svg|webp)$/i.test(f)); + + if (files.length === 0) { + console.log(`No figure images found in ${imagesDir}`); + process.exit(0); + } + + console.log( + `Uploading ${files.length} figures from "${sourceSlug}" to books/${targetSlug}/figures...`, + ); + + let uploaded = 0; + let failed = 0; + + for (const file of files) { + const filePath = path.join(imagesDir, file); + try { + const content = fs.readFileSync(filePath); + await convex.uploadFile({ + folderPath: `books/${targetSlug}/figures`, + basename: file, + content, + contentType: getContentType(file), + }); + uploaded++; + console.log(`✔ ${file}`); + } catch (e) { + failed++; + const msg = e instanceof Error ? e.message : String(e); + console.error(`✖ ${file}: ${msg}`); + } + } + + console.log(`Done. Uploaded ${uploaded}/${files.length} figures.`); + if (failed > 0) { + process.exit(1); + } +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/apps/pipeline/src/shared/pipelineTypes.ts b/apps/pipeline/src/shared/pipelineTypes.ts index affc1417..e7adfd0a 100644 --- a/apps/pipeline/src/shared/pipelineTypes.ts +++ b/apps/pipeline/src/shared/pipelineTypes.ts @@ -40,6 +40,7 @@ export type StyleSelectionState = z.infer; export const StepEnum = z.enum([ "import_epub", "create_settings", + "upload_figures", "generate_reference_cards", "rewrite_paragraphs", "generate_graphical_style", @@ -59,6 +60,7 @@ export type Step = z.infer; export const StepLabels: Record = { import_epub: "Import EPUB", create_settings: "Create Settings", + upload_figures: "Upload Figures", generate_reference_cards: "Generate Reference Cards", rewrite_paragraphs: "Rewrite Paragraphs", generate_graphical_style: "Generate Graphical Style", diff --git a/apps/pipeline/src/tools/fixLongXml.ts b/apps/pipeline/src/tools/fixLongXml.ts index 2b4f94d2..9fa1de99 100644 --- a/apps/pipeline/src/tools/fixLongXml.ts +++ b/apps/pipeline/src/tools/fixLongXml.ts @@ -60,7 +60,7 @@ function formatChapterElement(chapter: Element, serializer: XMLSerializer): stri function formatSource(xml: string): string { const parser = new DOMParser({ onError: () => {} }); - const doc = parser.parseFromString(xml, "text/xml"); + const doc = parser.parseFromString(xml, "text/html"); const serializer = new XMLSerializer(); // If the file is just one , format that; otherwise format all Chapters found. 
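The text/xml → text/html parser changes in this patch (fixLongXml.ts above, wrapChaptersWithSections.ts and the Convex actions below) all trade strict XML parsing for lenient HTML parsing. A minimal sketch of the difference, not part of the patch, using an invented sample string and jsdom as already imported by wrapChaptersWithSections.ts:

import { JSDOM } from "jsdom";

// LLM-rewritten chapter markup is frequently not well-formed XML
// (unclosed <p> elements, stray inline tags), so the pipeline now
// parses it as HTML instead of XML.
const sloppyMarkup = "<section><p>First line<p>Second line</section>";

// An XML parse would typically fail or mangle this input;
// the HTML parser error-corrects it and recovers both paragraphs.
const dom = new JSDOM(sloppyMarkup, { contentType: "text/html" });
console.log(dom.window.document.querySelectorAll("p").length); // 2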
diff --git a/apps/pipeline/src/tools/generate-book-cli/wrapChaptersWithSections.ts b/apps/pipeline/src/tools/generate-book-cli/wrapChaptersWithSections.ts index a54cc7de..3a8df7c2 100644 --- a/apps/pipeline/src/tools/generate-book-cli/wrapChaptersWithSections.ts +++ b/apps/pipeline/src/tools/generate-book-cli/wrapChaptersWithSections.ts @@ -17,7 +17,7 @@ import { JSDOM } from "jsdom"; * whitespace‑agnostic. */ export function wrapChaptersWithSections(xml: string): string { - const dom = new JSDOM(xml, { contentType: "text/xml" }); + const dom = new JSDOM(xml, { contentType: "text/html" }); const doc = dom.window.document; const NS = doc.documentElement.namespaceURI || null; diff --git a/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts b/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts index 76afe885..5359f4e9 100644 --- a/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts +++ b/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts @@ -1,4 +1,4 @@ -import { callClaude, callGeminiWrapper } from "../callClaude"; +import { callGeminiWrapper } from "../callClaude"; import { getParagraphsFromChapter, getSectionAttributesFromChapter, @@ -129,7 +129,6 @@ async function processChunk( callGeminiWrapper, callGeminiWrapper, callGrok, - callClaude, callGpt5, ]; @@ -299,7 +298,7 @@ export const identifyAndRewriteParagraphs = async ( writeBookFile(`compiled-prompt-for-chapter-${chapter}-gemini2.md`, compiledPrompt); - const llmProviders = [callGeminiWrapper, callGrok, callClaude, callGpt5]; + const llmProviders = [callGeminiWrapper, callGeminiWrapper, callGrok, callGpt5]; try { const selectedProvider = llmProviders[attempt % llmProviders.length]; diff --git a/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.spec.ts b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.spec.ts index e8326e41..fe8d8936 100644 --- a/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.spec.ts +++ b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.spec.ts @@ -33,4 +33,80 @@ describe("restoreUnwrappedLines", () => { expect(restoreUnwrappedLines(original, model)).toBe(expected); }); + + it("repairs a line that closes a

</p> without opening it", () => {
+    const original = [
+      "<p>'Then you must know where you found it?'</p>",
+      "<p>'Yes, it was on the prisoner's wardrobe.'</p>",
+      "<p>'That is better.'</p>",
+    ].join("\n");
+
+    const model = [
+      "<p>'Then you must know where you found it?'</p>",
+      "'Yes, it was on the prisoner's wardrobe.'</p>",
+      "<p>'That is better.'</p>",
+    ].join("\n");
+
+    const expected = [
+      "<p>'Then you must know where you found it?'</p>",
+      "<p>'Yes, it was on the prisoner's wardrobe.'</p>",
+      "<p>'That is better.'</p>",
+    ].join("\n");
+
+    expect(restoreUnwrappedLines(original, model)).toBe(expected);
+  });
+
+  it("repairs orphan </p> when model output is a single line", () => {
+    const original = [
+      "<p>'Then you must know where you found it?'</p>",
+      "<p>'Yes, it was on the prisoner's wardrobe.'</p>",
+      "<p>'That is better.'</p>",
+    ].join("\n");
+
+    const model =
+      "<p>'Then you must know where you found it?'</p>" +
+      "'Yes, it was on the prisoner's wardrobe.'</p>" +
+      "<p>'That is better.'</p>";
+
+    const expected = [
+      "<p>'Then you must know where you found it?'</p>",
+      "<p>'Yes, it was on the prisoner's wardrobe.'</p>",
+      "<p>'That is better.'</p>",
+    ].join("\n");
+
+    expect(restoreUnwrappedLines(original, model)).toBe(expected);
+  });
+
+  it("wraps multiple consecutive bare lines using original wrappers", () => {
+    const original = ["<p>One.</p>", "<p>Two.</p>", "<p>Three.</p>"].join("\n");
+
+    const model = ["<p>One.</p>", "Two.", "Three."].join("\n");
+
+    const expected = ["<p>One.</p>", "<p>Two.</p>", "<p>Three.</p>"].join("\n");
+
+    expect(restoreUnwrappedLines(original, model)).toBe(expected);
+  });
+
+  it("wraps a bare line with inline tags and trailing </p>", () => {
+    const original = ["<p>She saw the prisoner.</p>", "<p>It was unexpected.</p>"].join("\n");
+
+    const model = [
+      "<p>She saw the prisoner.</p>",
+      '<em>It was unexpected.</em></p>',
+    ].join("\n");
+
+    const expected = [
+      "<p>She saw the prisoner.</p>",
+      '<p><em>It was unexpected.</em></p>',
+    ].join("\n");
+
+    expect(restoreUnwrappedLines(original, model)).toBe(expected);
+  });
+
+  it("keeps concatenated </p><p> tags on a single line unchanged", () => {
+    const original = ["<p>One.</p>", "<p>Two.</p>"].join("\n");
+    const model = "<p>One.</p><p>Two.</p>
"; + + expect(restoreUnwrappedLines(original, model)).toBe(model); + }); }); diff --git a/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.ts b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.ts index 30a94a1d..758393fe 100644 --- a/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.ts +++ b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.ts @@ -53,7 +53,14 @@ export function restoreUnwrappedLines(originalInner: string, modelInner: string) const blocks = extractOriginalBlocks(originalInner); if (blocks.length === 0) return modelInner; - const lines = modelInner.split(/\r?\n/); + const parser = new DOMParser(); + let normalizedModel = modelInner.includes("\n") + ? modelInner + : modelInner.replace(/<\/p>\s*/gi, "

\n").replace(/\s*\s*$/i); + if (closingPMatch) { + lineContent = lineContent.slice(0, closingPMatch.index).trimEnd(); + } + + const normalizedText = normalizeText(getTextContent(lineContent, parser)); const matchIndex = findMatchingIndex(blocks, originalIndex, normalizedText); if (matchIndex >= 0) { const indentMatch = line.match(/^\s*/); const indent = indentMatch ? indentMatch[0] : ""; const block = blocks[matchIndex]; - output.push(`${indent}${block.openTag}${trimmed}${block.closeTag}`); + output.push(`${indent}${block.openTag}${lineContent}${block.closeTag}`); originalIndex = matchIndex + 1; changed = true; continue; diff --git a/apps/player/src/components/modals/ImageZoomModal.tsx b/apps/player/src/components/modals/ImageZoomModal.tsx new file mode 100644 index 00000000..1907709c --- /dev/null +++ b/apps/player/src/components/modals/ImageZoomModal.tsx @@ -0,0 +1,266 @@ +import React, { useCallback, useEffect, useRef } from "react"; +import { X } from "lucide-react"; + +import { Dialog, DialogContent, DialogTitle } from "@player/components/ui/dialog"; +import { cn } from "@player/lib/utils"; + +interface ImageZoomModalProps { + src: string; + alt?: string; + onClose: () => void; +} + +type PointerPosition = { x: number; y: number }; + +const MIN_SCALE = 1; +const MAX_SCALE = 5; + +const clamp = (value: number, min: number, max: number) => Math.min(max, Math.max(min, value)); + +const getDistance = (a: PointerPosition, b: PointerPosition) => { + const dx = a.x - b.x; + const dy = a.y - b.y; + return Math.hypot(dx, dy); +}; + +const getMidpoint = (a: PointerPosition, b: PointerPosition) => ({ + x: (a.x + b.x) / 2, + y: (a.y + b.y) / 2, +}); + +const ImageZoomModal: React.FC = ({ src, alt, onClose }) => { + const containerRef = useRef(null); + const imgRef = useRef(null); + const baseSizeRef = useRef<{ width: number; height: number } | null>(null); + const pointersRef = useRef>(new Map()); + const lastPanRef = useRef(null); + const pinchRef = useRef<{ + startDistance: number; + startScale: number; + startX: number; + startY: number; + startMid: PointerPosition; + } | null>(null); + const transformRef = useRef({ scale: 1, x: 0, y: 0 }); + const rafRef = useRef(null); + + const applyTransform = useCallback(() => { + if (rafRef.current !== null) return; + rafRef.current = window.requestAnimationFrame(() => { + rafRef.current = null; + const img = imgRef.current; + if (!img) return; + const { scale, x, y } = transformRef.current; + img.style.transform = `translate(${x}px, ${y}px) scale(${scale})`; + }); + }, []); + + const clampTranslate = useCallback(() => { + const container = containerRef.current; + const base = baseSizeRef.current; + if (!container || !base) return; + + const rect = container.getBoundingClientRect(); + const { scale } = transformRef.current; + + if (scale <= 1) { + transformRef.current.x = 0; + transformRef.current.y = 0; + return; + } + + const maxOffsetX = Math.max(0, (base.width * scale - rect.width) / 2); + const maxOffsetY = Math.max(0, (base.height * scale - rect.height) / 2); + + transformRef.current.x = clamp(transformRef.current.x, -maxOffsetX, maxOffsetX); + transformRef.current.y = clamp(transformRef.current.y, -maxOffsetY, maxOffsetY); + }, []); + + const resetTransform = useCallback(() => { + transformRef.current = { scale: 1, x: 0, y: 0 }; + if (imgRef.current) { + imgRef.current.style.transform = "translate(0px, 0px) scale(1)"; + } + }, []); + + const handleImageLoad = useCallback(() => { + resetTransform(); + if (imgRef.current) { + const rect = 
imgRef.current.getBoundingClientRect(); + baseSizeRef.current = { width: rect.width, height: rect.height }; + } + }, [resetTransform]); + + const handlePointerDown = useCallback((event: React.PointerEvent) => { + if (event.button !== 0) return; + const container = containerRef.current; + if (!container) return; + + container.setPointerCapture(event.pointerId); + pointersRef.current.set(event.pointerId, { x: event.clientX, y: event.clientY }); + + if (pointersRef.current.size === 1) { + lastPanRef.current = { x: event.clientX, y: event.clientY }; + pinchRef.current = null; + } else if (pointersRef.current.size === 2) { + const [p1, p2] = Array.from(pointersRef.current.values()); + pinchRef.current = { + startDistance: getDistance(p1, p2), + startScale: transformRef.current.scale, + startX: transformRef.current.x, + startY: transformRef.current.y, + startMid: getMidpoint(p1, p2), + }; + lastPanRef.current = null; + } + }, []); + + const handlePointerMove = useCallback( + (event: React.PointerEvent) => { + if (!pointersRef.current.has(event.pointerId)) return; + + pointersRef.current.set(event.pointerId, { x: event.clientX, y: event.clientY }); + + if (pointersRef.current.size === 1) { + if (transformRef.current.scale <= 1) return; + const last = lastPanRef.current; + if (!last) return; + const dx = event.clientX - last.x; + const dy = event.clientY - last.y; + transformRef.current.x += dx; + transformRef.current.y += dy; + lastPanRef.current = { x: event.clientX, y: event.clientY }; + clampTranslate(); + applyTransform(); + return; + } + + if (pointersRef.current.size >= 2 && pinchRef.current) { + const [p1, p2] = Array.from(pointersRef.current.values()); + const dist = getDistance(p1, p2); + const nextScale = clamp( + pinchRef.current.startScale * (dist / pinchRef.current.startDistance), + MIN_SCALE, + MAX_SCALE, + ); + const mid = getMidpoint(p1, p2); + const dx = mid.x - pinchRef.current.startMid.x; + const dy = mid.y - pinchRef.current.startMid.y; + transformRef.current.scale = nextScale; + transformRef.current.x = pinchRef.current.startX + dx; + transformRef.current.y = pinchRef.current.startY + dy; + clampTranslate(); + applyTransform(); + } + }, + [applyTransform, clampTranslate], + ); + + const handlePointerUp = useCallback( + (event: React.PointerEvent) => { + if (!pointersRef.current.has(event.pointerId)) return; + pointersRef.current.delete(event.pointerId); + if (pointersRef.current.size < 2) { + pinchRef.current = null; + } + if (pointersRef.current.size === 1) { + const [remaining] = Array.from(pointersRef.current.values()); + lastPanRef.current = remaining ?? 
null; + } else { + lastPanRef.current = null; + } + + if (transformRef.current.scale <= 1) { + transformRef.current = { scale: 1, x: 0, y: 0 }; + applyTransform(); + } + }, + [applyTransform], + ); + + useEffect(() => { + const container = containerRef.current; + if (!container) return; + + const handleWheel = (event: WheelEvent) => { + event.preventDefault(); + const { scale, x, y } = transformRef.current; + const nextScale = clamp(scale * (1 - event.deltaY * 0.0015), MIN_SCALE, MAX_SCALE); + if (nextScale === scale) return; + + const rect = container.getBoundingClientRect(); + const pointer = { + x: event.clientX - rect.left - rect.width / 2, + y: event.clientY - rect.top - rect.height / 2, + }; + const scaleRatio = nextScale / scale; + transformRef.current.scale = nextScale; + transformRef.current.x = x + (1 - scaleRatio) * pointer.x; + transformRef.current.y = y + (1 - scaleRatio) * pointer.y; + + clampTranslate(); + applyTransform(); + }; + + container.addEventListener("wheel", handleWheel, { passive: false }); + return () => { + container.removeEventListener("wheel", handleWheel); + }; + }, [applyTransform, clampTranslate]); + + useEffect(() => { + resetTransform(); + }, [src, resetTransform]); + + return ( + !open && onClose()} modal> + {alt || "Image"} + { + // Prevent dragging the dialog itself + e.preventDefault(); + }} + > + + +
+ {alt +
+
+
+ ); +}; + +export default ImageZoomModal; diff --git a/apps/player/src/features/ModalRenderers.tsx b/apps/player/src/features/ModalRenderers.tsx index c7628cb8..0920a86f 100644 --- a/apps/player/src/features/ModalRenderers.tsx +++ b/apps/player/src/features/ModalRenderers.tsx @@ -15,6 +15,7 @@ import { MusicEditModalRenderer } from "./modals/musicEdit/MusicEditModalRendere import { MusicAddModalRenderer } from "./modals/musicAdd/MusicAddModalRenderer"; import { NoteEditModalRenderer } from "./modals/noteEdit/NoteEditModalRenderer"; import { GraphicsSettingsModalRenderer } from "./modals/graphicsSettings/GraphicsSettingsModalRenderer"; +import { ImageZoomModalRenderer } from "./modals/imageZoom/ImageZoomModalRenderer"; export const ModalRenderers: React.FC = () => { return ( @@ -35,6 +36,7 @@ export const ModalRenderers: React.FC = () => { + ); }; diff --git a/apps/player/src/features/modals/imageZoom/ImageZoomModalRenderer.tsx b/apps/player/src/features/modals/imageZoom/ImageZoomModalRenderer.tsx new file mode 100644 index 00000000..d61c7042 --- /dev/null +++ b/apps/player/src/features/modals/imageZoom/ImageZoomModalRenderer.tsx @@ -0,0 +1,20 @@ +import React from "react"; +import { createPortal } from "react-dom"; +import { AnimatePresence } from "motion/react"; + +import { useImageModal } from "@player/stores/modals/imageModal.store"; +import { useEscapeKey } from "@player/hooks/useEscapeKey"; +import ImageZoomModal from "@player/components/modals/ImageZoomModal"; + +export const ImageZoomModalRenderer: React.FC = () => { + const { isOpen, src, alt, closeModal } = useImageModal(); + + useEscapeKey(isOpen, closeModal); + + return createPortal( + + {isOpen && src ? : null} + , + document.body, + ); +}; diff --git a/apps/player/src/hooks/useBookContent.ts b/apps/player/src/hooks/useBookContent.ts index 9a04e11e..5d917901 100644 --- a/apps/player/src/hooks/useBookContent.ts +++ b/apps/player/src/hooks/useBookContent.ts @@ -8,6 +8,7 @@ import { replaceXmlTagsIntoHtmlTags } from "@player/helpers/replaceXmlTagsIntoHt import { activateCharacterInteractions } from "@player/helpers/activateCharacterInteractions"; import { activateFootnoteInteractions } from "@player/helpers/activateFootnoteInteractions"; import { useBookConvex } from "@player/context/BookConvexContext"; +import { useImageModal } from "@player/stores/modals/imageModal.store"; import { markLayoutUnstable, LAYOUT_UNSTABLE_VIRTUALIZER_MS, @@ -48,6 +49,7 @@ export function useBookContent() { const { currentChapter } = location; const { isPlayFormat } = useBookForm(); const { openModal: openCharacterDetailsModal } = useCharacterModal(); + const { openModal: openImageModal } = useImageModal(); // Initialize to -1 so the first real version (0 or 1) is always detected as a change // Using textVersion as initial value would miss the first update if component mounts after version change @@ -77,6 +79,21 @@ export function useBookContent() { const target = event.target as HTMLElement; + const imageTarget = target instanceof HTMLImageElement ? 
target : target.closest("img"); + if (imageTarget) { + const isInlineAvatar = imageTarget.closest(".inline-avatar"); + const isInChapter = imageTarget.closest("section[data-chapter]"); + if (!isInlineAvatar && isInChapter) { + const src = imageTarget.currentSrc || imageTarget.src; + if (src) { + event.preventDefault(); + event.stopPropagation(); + openImageModal({ src, alt: imageTarget.alt || undefined }); + return; + } + } + } + const isInlineAvatar = target.closest(".inline-avatar"); const isCharacterHighlighted = target.classList.contains("character-highlighted-activated"); const isCharacterPlaceholder = target.closest(".character-placeholder"); @@ -163,7 +180,7 @@ export function useBookContent() { activateFootnoteInteractions(complexitySpan); setSentenceAsClicked(currentSentenceId); }, - [openCharacterDetailsModal, isPlayFormat], + [openCharacterDetailsModal, openImageModal, isPlayFormat], ); useEffect(() => { diff --git a/apps/player/src/stores/modals/imageModal.store.ts b/apps/player/src/stores/modals/imageModal.store.ts new file mode 100644 index 00000000..b2d59f8b --- /dev/null +++ b/apps/player/src/stores/modals/imageModal.store.ts @@ -0,0 +1,44 @@ +import { create } from "zustand"; +import { devtools } from "zustand/middleware"; + +import { useModalCoordinator } from "../modalCoordinator.store"; + +export interface ImageModalParams { + src: string; + alt?: string; +} + +const MODAL_ID = "image-modal"; + +interface ImageModalState { + isOpen: boolean; + src?: string; + alt?: string; + + openModal: (params: ImageModalParams) => void; + closeModal: () => void; +} + +export const useImageModal = create()( + devtools( + (set) => ({ + isOpen: false, + src: undefined, + alt: undefined, + + openModal: ({ src, alt }: ImageModalParams) => { + const coordinator = useModalCoordinator.getState(); + if (coordinator.requestModalOpen(MODAL_ID)) { + set({ isOpen: true, src, alt }); + } + }, + + closeModal: () => { + const coordinator = useModalCoordinator.getState(); + coordinator.releaseModal(MODAL_ID); + set({ isOpen: false, src: undefined, alt: undefined }); + }, + }), + { name: "image-modal" }, + ), +); diff --git a/apps/player/src/styles/se-semantics.css b/apps/player/src/styles/se-semantics.css index a7ece189..d037e1c9 100644 --- a/apps/player/src/styles/se-semantics.css +++ b/apps/player/src/styles/se-semantics.css @@ -290,6 +290,7 @@ margin: auto; max-height: 100vh; max-width: 100%; + cursor: zoom-in; } figure.full-page { diff --git a/convex/characterPromptGeneration.ts b/convex/characterPromptGeneration.ts index 7a81b31c..290b929a 100644 --- a/convex/characterPromptGeneration.ts +++ b/convex/characterPromptGeneration.ts @@ -97,7 +97,7 @@ export const generateCharacterPrompt = internalAction({ } const parser = new DOMParser(); - const doc = parser.parseFromString(xmlResult.content, "text/xml"); + const doc = parser.parseFromString(xmlResult.content, "text/html"); const section = doc.getElementsByTagName("section")[0] as XmlDomElement; if (!section) { diff --git a/convex/generator.ts b/convex/generator.ts index b760a3c6..e8f8d37e 100644 --- a/convex/generator.ts +++ b/convex/generator.ts @@ -5,6 +5,7 @@ import { adminMutation, publicQuery } from "./functions"; const PIPELINE_STEPS = [ "import_epub", "create_settings", + "upload_figures", "generate_reference_cards", "rewrite_paragraphs", "generate_graphical_style", @@ -88,6 +89,7 @@ export const ensureBookStructure = adminMutation({ await createFolder(`${bookPath}/characters-data`); await createFolder(`${bookPath}/backgrounds`); await 
createFolder(`${bookPath}/music`); + await createFolder(`${bookPath}/figures`); const existingBook = await ctx.db .query("books") diff --git a/convex/paragraphEditor.ts b/convex/paragraphEditor.ts index 74a28bf6..26e75070 100644 --- a/convex/paragraphEditor.ts +++ b/convex/paragraphEditor.ts @@ -241,7 +241,7 @@ export const setParagraphSpeaker = bookAction({ console.log("[setParagraphSpeaker] HTML length:", htmlResult.content.length); const parser = new DOMParser(); - const doc = parser.parseFromString(htmlResult.content, "text/xml"); + const doc = parser.parseFromString(htmlResult.content, "text/html"); const paragraph = findParagraphByIndex(doc, paragraphIndex); console.log( @@ -345,7 +345,7 @@ export const modifyCharacterTag = bookAction({ } const parser = new DOMParser(); - const doc = parser.parseFromString(htmlResult.content, "text/xml"); + const doc = parser.parseFromString(htmlResult.content, "text/html"); const paragraph = findParagraphByIndex(doc, paragraphIndex); if (!paragraph) { @@ -451,7 +451,7 @@ export const wrapTextWithCharacter = bookAction({ } const parser = new DOMParser(); - const doc = parser.parseFromString(htmlResult.content, "text/xml"); + const doc = parser.parseFromString(htmlResult.content, "text/html"); const paragraph = findParagraphByIndex(doc, paragraphIndex); if (!paragraph) { @@ -547,7 +547,7 @@ export const removeNoteFromChapter = bookAction({ } const parser = new DOMParser(); - const doc = parser.parseFromString(htmlResult.content, "text/xml"); + const doc = parser.parseFromString(htmlResult.content, "text/html"); let noteFound = false;
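As a usage illustration for the new image-zoom flow (imageModal.store.ts, ImageZoomModalRenderer, ImageZoomModal): any component can hand an image to the modal through the store. The CoverFigure component below is hypothetical and not part of the patch; only useImageModal and its openModal({ src, alt }) signature come from the diff above.

import React from "react";
import { useImageModal } from "@player/stores/modals/imageModal.store";

// Hypothetical consumer - clicking the image opens the zoom modal, which
// ImageZoomModalRenderer then portals into document.body.
export function CoverFigure({ src, alt }: { src: string; alt?: string }) {
  const { openModal } = useImageModal();
  return <img src={src} alt={alt ?? ""} onClick={() => openModal({ src, alt })} />;
}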