diff --git a/.gitignore b/.gitignore index 494a3e8d..9eba0883 100644 --- a/.gitignore +++ b/.gitignore @@ -68,3 +68,6 @@ fastlane/test_output claude-agent-sdk-demos xcuserdata + +apps/pipeline/app.log +.tmp diff --git a/apps/pipeline-ui/src/pages/StandardEbooksPage.tsx b/apps/pipeline-ui/src/pages/StandardEbooksPage.tsx index 6717d5df..e9b8f891 100644 --- a/apps/pipeline-ui/src/pages/StandardEbooksPage.tsx +++ b/apps/pipeline-ui/src/pages/StandardEbooksPage.tsx @@ -1,7 +1,7 @@ import { useState, useEffect, useCallback, useRef, useLayoutEffect } from "react"; import { useNavigate } from "react-router-dom"; import { trpc } from "../trpc"; -import { Loader2, ArrowLeft, BookOpen } from "lucide-react"; +import { Loader2, ArrowLeft, BookOpen, Search, X } from "lucide-react"; import { BookCard, type CollectionBook } from "../components/BookCard"; import { BookModal } from "../components/BookModal"; import { Button } from "@/components/ui/button"; @@ -136,15 +136,18 @@ function AuthorLetterRow({ export function StandardEbooksPage() { const navigate = useNavigate(); const [groupedBooks, setGroupedBooks] = useState>({}); + const [allBooks, setAllBooks] = useState([]); const [isLoading, setIsLoading] = useState(true); const [totalBooks, setTotalBooks] = useState(0); const [modalBook, setModalBook] = useState(null); + const [searchQuery, setSearchQuery] = useState(""); useEffect(() => { const loadData = async () => { try { const data = await trpc.getStandardEbooksIndex.query(); setGroupedBooks(data.groupedByAuthorLetter); + setAllBooks(data.books); setTotalBooks(data.books.length); } catch (e) { console.error("Failed to load Standard Ebooks index:", e); @@ -178,7 +181,33 @@ export function StandardEbooksPage() { ); } - const sortedLetters = Object.keys(groupedBooks).sort(); + const normalizedQuery = searchQuery.trim().toLowerCase(); + const filteredBooks = normalizedQuery + ? allBooks.filter((book) => { + const haystack = [ + book.title, + book.author, + book.authorFileAs, + book.description, + book.subjects.join(" "), + ] + .join(" ") + .toLowerCase(); + return haystack.includes(normalizedQuery); + }) + : allBooks; + + const visibleGroupedBooks = normalizedQuery + ? filteredBooks.reduce>((acc, book) => { + const firstLetter = (book.authorFileAs || book.author).charAt(0).toUpperCase(); + if (!acc[firstLetter]) acc[firstLetter] = []; + acc[firstLetter].push(book); + return acc; + }, {}) + : groupedBooks; + + const sortedLetters = Object.keys(visibleGroupedBooks).sort(); + const visibleCount = normalizedQuery ? filteredBooks.length : totalBooks; return (
@@ -189,7 +218,7 @@ export function StandardEbooksPage() { Standard Ebooks

- {totalBooks} professionally formatted public domain books + {visibleCount} professionally formatted public domain books

-
- {sortedLetters.map((letter, idx) => ( -
- +
+
+ + setSearchQuery(e.target.value)} + placeholder="Search by title, author, or subject..." + className="w-full pl-10 pr-10 py-2 rounded-md bg-background border border-border text-foreground placeholder:text-muted-foreground focus:outline-none focus:ring-2 focus:ring-primary/50" /> + {searchQuery.trim() && ( + + )}
- ))} +
+ {normalizedQuery + ? `Showing ${visibleCount} of ${totalBooks} books` + : `${totalBooks} books`} +
+
+
+ +
+ {sortedLetters.length === 0 ? ( +
+ No books match your search. +
+ ) : ( + sortedLetters.map((letter, idx) => ( +
+ +
+ )) + )}
diff --git a/apps/pipeline/AGENTS.md b/apps/pipeline/AGENTS.md index 78a579d1..3cb77b26 100644 --- a/apps/pipeline/AGENTS.md +++ b/apps/pipeline/AGENTS.md @@ -24,6 +24,7 @@ bun src/continue-pipeline-cli.ts books-data/my-book --status | ----------------------------- | ------------------------- | -------------------------------- | | `import_epub` | EPUB → FB2 → rich.xml | `input/rich.xml` | | `create_settings` | Detect language, metadata | `bookSettings.json` | +| `upload_figures` | Upload SE figures | Convex `books/*/figures` | | `generate_reference_cards` | Character summaries | `single-summary-per-person.json` | | `rewrite_paragraphs` | Inject character tags | `rewritten-paragraphs-*.xml` | | `generate_graphical_style` | Visual style JSON | `graphicalStyle.json` | diff --git a/apps/pipeline/package.json b/apps/pipeline/package.json index ac690da2..2764023e 100644 --- a/apps/pipeline/package.json +++ b/apps/pipeline/package.json @@ -19,12 +19,14 @@ "typecheck": "tsgo --noEmit --incremental" }, "dependencies": { - "@ai-sdk/anthropic": "2.0.38", - "@ai-sdk/cerebras": "^1.0.20", - "@ai-sdk/google": "^2.0.14", - "@ai-sdk/groq": "^2.0.21", - "@ai-sdk/openai": "^2.0.30", - "@ai-sdk/provider": "^2.0.0", + "@ai-sdk/anthropic": "^3.0.36", + "@ai-sdk/azure": "^3.0.26", + "@ai-sdk/cerebras": "^2.0.30", + "@ai-sdk/google": "^3.0.21", + "@ai-sdk/groq": "^3.0.21", + "@ai-sdk/openai": "^3.0.25", + "@ai-sdk/provider": "^3.0.7", + "@ai-sdk/react": "^3.0.73", "@anthropic-ai/claude-agent-sdk": "^0.1.59", "@anthropic-ai/sdk": "^0.39.0", "@arcjet/bun": "^1.0.0-beta.15", @@ -37,7 +39,7 @@ "@google-cloud/vertexai": "^1.9.3", "@google/genai": "^0.9.0", "@google/generative-ai": "^0.24.0", - "@openrouter/ai-sdk-provider": "^1.2.0", + "@openrouter/ai-sdk-provider": "^2.1.1", "@runwayml/sdk": "^2.0.2", "@sentry/node": "^10.11.0", "@trpc/server": "^11.0.0-rc.660", @@ -47,7 +49,7 @@ "@types/multer": "^2.0.0", "@types/ws": "^8.18.0", "@xmldom/xmldom": "^0.9.8", - "ai": "5.0.81", + "ai": "^6.0.71", "async_hooks": "^1.0.0", "axios": "^1.8.1", "chalk": "^5.4.1", diff --git a/apps/pipeline/src/callClaude.ts b/apps/pipeline/src/callClaude.ts index ce96f32e..a0368aff 100644 --- a/apps/pipeline/src/callClaude.ts +++ b/apps/pipeline/src/callClaude.ts @@ -185,7 +185,11 @@ export const callClaude = async ( }; const doIt = async () => { - const result = await callClaude("Identify all named book characters (people) in this page.\n"); + const result = (await callGeminiWrapper( + "Identify all named book characters (people) in this page.\n", + undefined, + 1, + )) as string; logger.info(result); }; // Execute only if this file is being run directly (not imported) diff --git a/apps/pipeline/src/callFastGemini.ts b/apps/pipeline/src/callFastGemini.ts index 1a285004..4f01ad00 100644 --- a/apps/pipeline/src/callFastGemini.ts +++ b/apps/pipeline/src/callFastGemini.ts @@ -6,10 +6,9 @@ import { } from "@google/genai"; import { type z } from "zod"; import { google } from "@ai-sdk/google"; -import { generateObject, generateText, streamText, wrapLanguageModel } from "ai"; +import { generateObject, generateText, streamText } from "ai"; import { toGeminiSchema } from "gemini-zod"; import "dotenv/config"; -import { openrouter } from "@openrouter/ai-sdk-provider"; import type { LanguageModelV2Middleware } from "@ai-sdk/provider"; export const callFastGemini = async ( @@ -76,44 +75,25 @@ Based on the book text answer the user's question, using quotes from the wider b }; export const callGeminiWithThinking = async (prompt: string) => { - const ai = 
new GoogleGenAI({ apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY }); - const config = { - responseMimeType: "text/plain", - httpOptions: { - timeout: 15 * 60 * 1000, // 15 minutes in milliseconds - }, - }; - const model = "gemini-3-flash-preview"; - // const model = "gemini-3-pro-preview"; - - const contents = [{ role: "user", parts: [{ text: prompt }] }]; const safetySettings = [ - { category: HarmCategory.HARM_CATEGORY_HARASSMENT, threshold: HarmBlockThreshold.BLOCK_NONE }, - { category: HarmCategory.HARM_CATEGORY_HATE_SPEECH, threshold: HarmBlockThreshold.BLOCK_NONE }, - { - category: HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, - threshold: HarmBlockThreshold.BLOCK_NONE, - }, - { - category: HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, - threshold: HarmBlockThreshold.BLOCK_NONE, - }, - { - category: HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY, - threshold: HarmBlockThreshold.BLOCK_NONE, - }, + { category: HarmCategory.HARM_CATEGORY_HARASSMENT, threshold: HarmBlockThreshold.OFF }, + { category: HarmCategory.HARM_CATEGORY_HATE_SPEECH, threshold: HarmBlockThreshold.OFF }, + { category: HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, threshold: HarmBlockThreshold.OFF }, + { category: HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, threshold: HarmBlockThreshold.OFF }, + { category: HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY, threshold: HarmBlockThreshold.OFF }, ]; - - console.log("before response", model); - const response = await ai.models.generateContent({ - model, - config: { ...config, safetySettings }, - contents, + console.log("CALLING GEMINI WITH THINKING"); + const { textStream } = await streamText({ + model: google("gemini-3-flash-preview"), + prompt, + experimental_telemetry: { isEnabled: true, recordInputs: true, recordOutputs: true }, + providerOptions: { google: { safetySettings } }, }); - - console.log("after response"); - - return response?.text; + let text = ""; + for await (const textPart of textStream) { + text += textPart; + } + return text; }; export const callGeminiWithThinkingAndSchema = async ( @@ -270,54 +250,6 @@ export const anthropicThinkingSchemaMiddleware: LanguageModelV2Middleware = { }, }; -export const callSlowGeminiWithThinkingAndSchemaAndParsed = async ( - prompt: string, - zodSchema: z.ZodSchema, - model: string = "google/gemini-3-flash-preview", -) => { - const claudeModel = wrapLanguageModel({ - model: openrouter(model), - middleware: anthropicThinkingSchemaMiddleware, - }); - const { object } = await generateObject({ - model: - model.includes("claude") || model.includes("minimax") || model.includes("kimi") - ? 
claudeModel - : openrouter(model), - schema: zodSchema, - prompt, - experimental_telemetry: { isEnabled: true, recordInputs: true, recordOutputs: true }, - providerOptions: { - openrouter: { - safetySettings: [ - { - category: HarmCategory.HARM_CATEGORY_HARASSMENT, - threshold: HarmBlockThreshold.BLOCK_NONE, - }, - { - category: HarmCategory.HARM_CATEGORY_HATE_SPEECH, - threshold: HarmBlockThreshold.BLOCK_NONE, - }, - { - category: HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, - threshold: HarmBlockThreshold.BLOCK_NONE, - }, - { - category: HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, - threshold: HarmBlockThreshold.BLOCK_NONE, - }, - { - category: HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY, - threshold: HarmBlockThreshold.BLOCK_NONE, - }, - ], - }, - }, - }); - - return object as T; -}; - export const callGeminiWithImage = async ( prompt: string, imageBase64: string, diff --git a/apps/pipeline/src/callGrokAzure.ts b/apps/pipeline/src/callGrokAzure.ts new file mode 100644 index 00000000..13d29f94 --- /dev/null +++ b/apps/pipeline/src/callGrokAzure.ts @@ -0,0 +1,50 @@ +import OpenAI from "openai"; +import { type z } from "zod"; + +const endpoint = "https://bookgenius.services.ai.azure.com/openai/v1/"; +const model = "grok-4-fast-reasoning"; +const api_key = process.env.AZURE_GROK_KEY; + +const client = new OpenAI({ baseURL: endpoint, apiKey: api_key }); + +export const callGrokAzure = async (prompt: string) => { + const completion = await client.chat.completions.create({ + messages: [{ role: "user", content: prompt }], + model, + }); + + return completion.choices[0].message.content; +}; + +export const callGrokAzureWithSchema = async (prompt: string, zodSchema: z.ZodSchema) => { + const completion = await client.chat.completions.create({ + messages: [{ role: "user", content: prompt }], + model, + response_format: { + type: "json_schema", + json_schema: { + name: "response", + strict: true, + // @ts-expect-error(zod typing) + schema: zodSchema.shape, + }, + }, + }); + let result: T; + try { + result = JSON.parse(completion.choices[0].message.content as string) as T; + } catch (e) { + console.error("Error parsing JSON", e); + throw e; + } + return result; +}; + +// if (require.main === module) { +// const schema = z.object({ name: z.string(), age: z.number() }); +// const prompt = "What is my name? 
My name is John Doe and I'm 30"; +// const result = await callGrokAzureWithSchema(prompt, schema); +// console.log(result); +// console.log(result.name); +// console.log(result.age); +// } diff --git a/apps/pipeline/src/callO3.ts b/apps/pipeline/src/callO3.ts index 0e498b4e..36869adf 100644 --- a/apps/pipeline/src/callO3.ts +++ b/apps/pipeline/src/callO3.ts @@ -22,7 +22,6 @@ export const callO3WithSchema = async ( model: openai(model), schema: zodSchema, prompt, - // providerOptions: { google: { thinkingConfig: { thinkingBudget: 0, includeThoughts: true } } }, experimental_telemetry: { isEnabled: true, recordInputs: true, recordOutputs: true }, }); @@ -36,7 +35,7 @@ export const callGpt5 = async ( ) => { const chatCompletion = await client.chat.completions.create({ messages: [{ role: "user", content: prompt }], - model: "gpt-5.1", + model: "gpt-5.2", reasoning_effort: "medium", }); return chatCompletion.choices[0].message.content as string; diff --git a/apps/pipeline/src/helpers/logError.ts b/apps/pipeline/src/helpers/logError.ts new file mode 100644 index 00000000..0e62486a --- /dev/null +++ b/apps/pipeline/src/helpers/logError.ts @@ -0,0 +1,8 @@ +export function logError(contextMessage: string, err: unknown) { + if (err instanceof Error) { + console.error(`${contextMessage} ${err.message}`); + console.error(err.stack); + return; + } + console.error(`${contextMessage} ${String(err)}`); +} diff --git a/apps/pipeline/src/lib/domParser.ts b/apps/pipeline/src/lib/domParser.ts new file mode 100644 index 00000000..0f0301e7 --- /dev/null +++ b/apps/pipeline/src/lib/domParser.ts @@ -0,0 +1,16 @@ +import { JSDOM } from "jsdom"; + +let initialized = false; + +export function ensureDomParser(): void { + if (typeof (globalThis as { DOMParser?: unknown }).DOMParser !== "undefined") { + return; + } + if (initialized) { + return; + } + + const { window } = new JSDOM(""); + (globalThis as { DOMParser: typeof window.DOMParser }).DOMParser = window.DOMParser; + initialized = true; +} diff --git a/apps/pipeline/src/lib/paragraphCount.ts b/apps/pipeline/src/lib/paragraphCount.ts new file mode 100644 index 00000000..9cf8bb1c --- /dev/null +++ b/apps/pipeline/src/lib/paragraphCount.ts @@ -0,0 +1,10 @@ +import { + countParagraphsFromChapterHtml, + type ParagraphCountOptions, +} from "@player/services/htmlNormalizer"; +import { ensureDomParser } from "./domParser"; + +export function computeParagraphCount(html: string, options?: ParagraphCountOptions): number { + ensureDomParser(); + return countParagraphsFromChapterHtml(html, options); +} diff --git a/apps/pipeline/src/scripts/fix-legacy-play-chapters.ts b/apps/pipeline/src/scripts/fix-legacy-play-chapters.ts new file mode 100644 index 00000000..1a0f8993 --- /dev/null +++ b/apps/pipeline/src/scripts/fix-legacy-play-chapters.ts @@ -0,0 +1,134 @@ +#!/usr/bin/env bun +import fs from "fs"; +import os from "os"; +import path from "path"; +import { fixLegacyPlayDidaskalia } from "../tools/fix-legacy-play-didaskalia"; +import { fixLegacyPlayCustomTags } from "../tools/fix-legacy-play-custom-tags"; +import { fixLegacyPlayStageDirections } from "../tools/fix-legacy-play-stage-directions"; +import { + applyMultiSpeakerMapToHtml, + extractMultiSpeakerNextLineMapFromXml, +} from "../tools/fix-legacy-play-multi-speaker"; + +type Args = { bookSlug: string; sourceDir: string; outputDir: string; dryRun: boolean }; + +function resolvePath(inputPath: string): string { + if (inputPath.startsWith("~/")) { + return path.join(process.env.HOME ?? 
"", inputPath.slice(2)); + } + return path.resolve(inputPath); +} + +function parseArgs(): Args { + const args = process.argv.slice(2); + if (args.length < 1) { + console.error( + "Usage: bun apps/pipeline/src/scripts/fix-legacy-play-chapters.ts [--source ] [--out ] [--dry-run]", + ); + process.exit(1); + } + + const bookSlug = args[0]; + const sourceIdx = args.indexOf("--source"); + const outIdx = args.indexOf("--out"); + + const repoRoot = path.resolve(process.cwd()); + const defaultSource = path.join(repoRoot, "ConvexAssets", "books", bookSlug, "chapters-source"); + const defaultOut = path.join(os.tmpdir(), "bookgenius-fixed-chapters", bookSlug); + + return { + bookSlug, + sourceDir: resolvePath(sourceIdx !== -1 ? args[sourceIdx + 1] : defaultSource), + outputDir: resolvePath(outIdx !== -1 ? args[outIdx + 1] : defaultOut), + dryRun: args.includes("--dry-run"), + }; +} + +function isPlayBook(metadataXml: string): boolean { + return /
<Form>\s*Play\s*<\/Form>/i.test(metadataXml); +} + +function getChapterNumberFromFilename(filename: string): number | null { + const match = filename.match(/chapter-(\d+)\.html$/i); + if (!match) return null; + return parseInt(match[1], 10); +} + +async function main(): Promise<void> { + const { bookSlug, sourceDir, outputDir, dryRun } = parseArgs(); + const repoRoot = path.resolve(process.cwd()); + const booksContentDir = path.join(repoRoot, "books", bookSlug, "booksContent"); + const metadataPath = path.join(booksContentDir, "metadata.xml"); + + if (!fs.existsSync(metadataPath)) { + console.error(`Missing metadata.xml for ${bookSlug}: ${metadataPath}`); + process.exit(1); + } + + const metadataXml = fs.readFileSync(metadataPath, "utf-8"); + if (!isPlayBook(metadataXml)) { + console.error(`Book ${bookSlug} is not marked as Play.`); + process.exit(1); + } + + if (!fs.existsSync(sourceDir)) { + console.error(`Source directory not found: ${sourceDir}`); + process.exit(1); + } + + const htmlFiles = fs + .readdirSync(sourceDir) + .filter((file) => file.toLowerCase().endsWith(".html")); + + if (htmlFiles.length === 0) { + console.error(`No .html files found in ${sourceDir}`); + process.exit(1); + } + + if (!dryRun) { + fs.mkdirSync(outputDir, { recursive: true }); + } + + let changed = 0; + let processed = 0; + + for (const file of htmlFiles) { + const chapterNumber = getChapterNumberFromFilename(file); + const sourcePath = path.join(sourceDir, file); + const html = fs.readFileSync(sourcePath, "utf-8"); + + let updated = fixLegacyPlayStageDirections(html); + updated = fixLegacyPlayCustomTags(updated); + updated = fixLegacyPlayDidaskalia(updated); + + if (chapterNumber !== null) { + const xmlPath = path.join(booksContentDir, `chapter${chapterNumber}.xml`); + if (fs.existsSync(xmlPath)) { + const xml = fs.readFileSync(xmlPath, "utf-8"); + const map = extractMultiSpeakerNextLineMapFromXml(xml); + updated = applyMultiSpeakerMapToHtml(updated, map); + } + } + + processed += 1; + + if (updated !== html) { + changed += 1; + } + + if (dryRun) { + const note = updated !== html ? " (changed)" : ""; + console.log(`[dry-run] Would write ${file}${note}`); + } else { + const outPath = path.join(outputDir, file); + fs.writeFileSync(outPath, updated, "utf-8"); + } + } + + console.log(`Done. Processed ${processed} chapter(s). Changed ${changed}. Output: ${outputDir}`); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/apps/pipeline/src/scripts/fix-non-play-chapters.ts b/apps/pipeline/src/scripts/fix-non-play-chapters.ts new file mode 100644 index 00000000..ed3056ff --- /dev/null +++ b/apps/pipeline/src/scripts/fix-non-play-chapters.ts @@ -0,0 +1,107 @@ +#!/usr/bin/env bun +import fs from "fs"; +import os from "os"; +import path from "path"; +import { fixNonPlayCustomTags } from "../tools/fix-non-play-custom-tags"; + +type Args = { sourceRoot: string; outputRoot: string; slugs: string[] | null; dryRun: boolean }; + +const PLAY_SLUGS = new Set([ + "Hamlet", + "Macbeth", + "Midsummer-Nights-Dream", + "Othello", + "Romeo-And-Juliet", + "The-Tempest", + "Romeo-And-Juliet-Small", + "Romeo-And-Juliet-Smaller", +]); + +function resolvePath(inputPath: string): string { + if (inputPath.startsWith("~/")) { + return path.join(process.env.HOME ??
"", inputPath.slice(2)); + } + return path.resolve(inputPath); +} + +function parseArgs(): Args { + const args = process.argv.slice(2); + const sourceIdx = args.indexOf("--source"); + const outIdx = args.indexOf("--out"); + const slugsIdx = args.indexOf("--slugs"); + + const repoRoot = path.resolve(process.cwd()); + const defaultSource = path.join(repoRoot, "ConvexAssets", "books"); + const defaultOut = path.join(os.tmpdir(), "bookgenius-fixed-nonplays"); + + const slugs = + slugsIdx !== -1 ? (args[slugsIdx + 1]?.split(",").map((s) => s.trim()) ?? []) : null; + + return { + sourceRoot: resolvePath(sourceIdx !== -1 ? args[sourceIdx + 1] : defaultSource), + outputRoot: resolvePath(outIdx !== -1 ? args[outIdx + 1] : defaultOut), + slugs, + dryRun: args.includes("--dry-run"), + }; +} + +function listSlugs(root: string): string[] { + if (!fs.existsSync(root)) return []; + return fs + .readdirSync(root, { withFileTypes: true }) + .filter((entry) => entry.isDirectory()) + .map((entry) => entry.name); +} + +function listHtmlFiles(dir: string): string[] { + if (!fs.existsSync(dir)) return []; + return fs + .readdirSync(dir) + .filter((file) => file.toLowerCase().endsWith(".html")) + .map((file) => path.join(dir, file)); +} + +async function main(): Promise { + const { sourceRoot, outputRoot, slugs, dryRun } = parseArgs(); + const targetSlugs = slugs ?? listSlugs(sourceRoot); + + let processedBooks = 0; + let processedFiles = 0; + let changedFiles = 0; + + for (const slug of targetSlugs) { + if (PLAY_SLUGS.has(slug)) continue; + + const chaptersDir = path.join(sourceRoot, slug, "chapters-source"); + const files = listHtmlFiles(chaptersDir); + if (files.length === 0) continue; + + processedBooks += 1; + const outDir = path.join(outputRoot, slug, "chapters-source"); + if (!dryRun) { + fs.mkdirSync(outDir, { recursive: true }); + } + + for (const file of files) { + const html = fs.readFileSync(file, "utf-8"); + const updated = fixNonPlayCustomTags(html); + processedFiles += 1; + if (updated !== html) { + changedFiles += 1; + } + if (!dryRun) { + const outPath = path.join(outDir, path.basename(file)); + fs.writeFileSync(outPath, updated, "utf-8"); + } + } + } + + console.log( + `Done. Books: ${processedBooks}, Files: ${processedFiles}, Changed: ${changedFiles}. Output: ${outputRoot}`, + ); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/apps/pipeline/src/scripts/fix-unwrapped-paragraphs-in-temporary-output.ts b/apps/pipeline/src/scripts/fix-unwrapped-paragraphs-in-temporary-output.ts new file mode 100644 index 00000000..5e7ddca2 --- /dev/null +++ b/apps/pipeline/src/scripts/fix-unwrapped-paragraphs-in-temporary-output.ts @@ -0,0 +1,134 @@ +#!/usr/bin/env bun +import fs from "fs"; +import path from "path"; +import { restoreUnwrappedLines } from "../tools/new-tooling/restore-unwrapped-lines"; +import { buildSectionWrapper, extractSectionInner } from "../tools/new-tooling/section-wrapper"; + +const DEFAULT_OUTPUT_ROOT = + "/var/folders/j9/pbqwg7zs4336w7vccnz2xhcw0000gn/T/bookgenius-fixed-unwrapped"; + +type Args = { sourceRoot: string; outputRoot: string; slugs: string[] }; + +function resolvePath(inputPath: string): string { + if (inputPath.startsWith("~/")) { + return path.join(process.env.HOME ?? 
"", inputPath.slice(2)); + } + return path.resolve(inputPath); +} + +function parseArgs(): Args { + const args = process.argv.slice(2); + const sourceIdx = args.indexOf("--source"); + const outputIdx = args.indexOf("--out"); + const slugsIdx = args.indexOf("--slugs"); + + const repoRoot = path.resolve(process.cwd()); + const defaultSource = path.join(repoRoot, "apps", "pipeline", "books-data"); + + const sourceRoot = resolvePath(sourceIdx !== -1 ? args[sourceIdx + 1] : defaultSource); + const outputRoot = resolvePath(outputIdx !== -1 ? args[outputIdx + 1] : DEFAULT_OUTPUT_ROOT); + + let slugs: string[] = []; + if (slugsIdx !== -1) { + slugs = args[slugsIdx + 1]?.split(",").map((slug) => slug.trim()) ?? []; + } else { + slugs = fs + .readdirSync(sourceRoot, { withFileTypes: true }) + .filter((entry) => entry.isDirectory()) + .map((entry) => entry.name); + } + + return { sourceRoot, outputRoot, slugs }; +} + +function loadOriginalHtml(tempDir: string, chapter: number): string | null { + const directPath = path.join(tempDir, `original-paragraphs-for-chapter-${chapter}.xml`); + if (fs.existsSync(directPath)) { + return fs.readFileSync(directPath, "utf-8"); + } + + const prefix = `original-paragraphs-for-chapter-${chapter}-chunk-`; + const chunkFiles = fs + .readdirSync(tempDir) + .filter((file) => file.startsWith(prefix) && file.endsWith(".xml")) + .map((file) => ({ file, index: Number(file.slice(prefix.length).replace(/\.xml$/, "")) })) + .filter((entry) => Number.isFinite(entry.index)) + .sort((a, b) => a.index - b.index); + + if (chunkFiles.length === 0) return null; + + return chunkFiles + .map((entry) => fs.readFileSync(path.join(tempDir, entry.file), "utf-8")) + .join("\n"); +} + +function ensureDir(dir: string) { + fs.mkdirSync(dir, { recursive: true }); +} + +function main() { + const { sourceRoot, outputRoot, slugs } = parseArgs(); + + let totalFiles = 0; + let changedFiles = 0; + + for (const slug of slugs) { + const tempDir = path.join(sourceRoot, slug, "temporary-output"); + if (!fs.existsSync(tempDir)) { + console.warn(`Skipping ${slug}: temporary-output not found`); + continue; + } + + const outputTempDir = path.join(outputRoot, slug, "temporary-output"); + ensureDir(outputTempDir); + + const rewrittenFiles = fs + .readdirSync(tempDir) + .filter((file) => /^rewritten-paragraphs-for-chapter-\d+\.xml$/.test(file)); + + if (rewrittenFiles.length === 0) { + console.warn(`Skipping ${slug}: no rewritten-paragraphs files`); + continue; + } + + let slugChanged = 0; + + for (const file of rewrittenFiles) { + totalFiles += 1; + const chapter = Number(file.match(/(\d+)/)?.[1]); + if (!Number.isFinite(chapter)) { + continue; + } + + const modelRaw = fs.readFileSync(path.join(tempDir, file), "utf-8"); + const originalRaw = loadOriginalHtml(tempDir, chapter); + if (!originalRaw) { + console.warn(`Skipping ${slug} chapter ${chapter}: original paragraphs not found`); + continue; + } + + const modelExtract = extractSectionInner(modelRaw); + const originalExtract = extractSectionInner(originalRaw); + + const fixedInner = restoreUnwrappedLines(originalExtract.inner, modelExtract.inner); + const hasChanges = fixedInner !== modelExtract.inner; + + const output = hasChanges ? buildSectionWrapper(fixedInner, modelExtract.wrapper) : modelRaw; + + fs.writeFileSync(path.join(outputTempDir, file), output, "utf-8"); + + if (hasChanges) { + slugChanged += 1; + changedFiles += 1; + } + } + + console.log( + `Processed ${slug}: ${rewrittenFiles.length} chapter(s), changed ${slugChanged}. 
Output: ${outputTempDir}`, + ); + } + + console.log(`Done. Processed ${totalFiles} file(s). Changed ${changedFiles}.`); +} + +main(); diff --git a/apps/pipeline/src/scripts/scan-non-html-tags.ts b/apps/pipeline/src/scripts/scan-non-html-tags.ts new file mode 100644 index 00000000..9775f262 --- /dev/null +++ b/apps/pipeline/src/scripts/scan-non-html-tags.ts @@ -0,0 +1,226 @@ +#!/usr/bin/env bun +import fs from "fs"; +import path from "path"; + +type Args = { sourceRoot: string; slugs: string[]; limit: number }; + +const HTML_TAGS = new Set([ + "a", + "abbr", + "address", + "article", + "aside", + "audio", + "b", + "bdi", + "bdo", + "blockquote", + "br", + "button", + "canvas", + "caption", + "cite", + "code", + "col", + "colgroup", + "data", + "datalist", + "dd", + "del", + "details", + "dfn", + "dialog", + "div", + "dl", + "dt", + "em", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "header", + "hr", + "html", + "i", + "img", + "input", + "ins", + "kbd", + "label", + "legend", + "li", + "link", + "main", + "map", + "mark", + "meta", + "meter", + "nav", + "noscript", + "object", + "ol", + "optgroup", + "option", + "output", + "p", + "param", + "picture", + "pre", + "progress", + "q", + "rp", + "rt", + "ruby", + "s", + "samp", + "script", + "section", + "select", + "small", + "source", + "span", + "strong", + "style", + "sub", + "summary", + "sup", + "table", + "tbody", + "td", + "template", + "textarea", + "tfoot", + "th", + "thead", + "time", + "title", + "tr", + "track", + "u", + "ul", + "var", + "video", + "wbr", +]); + +function resolvePath(inputPath: string): string { + if (inputPath.startsWith("~/")) { + return path.join(process.env.HOME ?? "", inputPath.slice(2)); + } + return path.resolve(inputPath); +} + +function parseArgs(): Args { + const args = process.argv.slice(2); + const sourceIdx = args.indexOf("--source"); + const slugsIdx = args.indexOf("--slugs"); + const limitIdx = args.indexOf("--limit"); + + const repoRoot = path.resolve(process.cwd()); + const defaultSource = path.join(repoRoot, "ConvexAssets", "books"); + + const sourceRoot = resolvePath(sourceIdx !== -1 ? args[sourceIdx + 1] : defaultSource); + const limit = limitIdx !== -1 ? Number(args[limitIdx + 1]) : 5; + + let slugs: string[] = []; + if (slugsIdx !== -1) { + slugs = args[slugsIdx + 1]?.split(",").map((slug) => slug.trim()) ?? 
[]; + } else { + slugs = fs + .readdirSync(sourceRoot, { withFileTypes: true }) + .filter((entry) => entry.isDirectory()) + .map((entry) => entry.name); + } + + return { sourceRoot, slugs, limit }; +} + +function listHtmlFiles(dir: string): string[] { + if (!fs.existsSync(dir)) return []; + return fs + .readdirSync(dir) + .filter((file) => file.toLowerCase().endsWith(".html")) + .map((file) => path.join(dir, file)); +} + +function normalizeSnippet(text: string): string { + return text.replace(/\s+/g, " ").trim(); +} + +function main() { + const { sourceRoot, slugs, limit } = parseArgs(); + const regex = /<\s*([A-Za-z][A-Za-z0-9-]*)\b/g; + const ignoredTags = new Set(["hgroup"]); + + const missing: string[] = []; + const tagCounts = new Map(); + const tagSamples = new Map>(); + + let totalFiles = 0; + let scannedSlugs = 0; + + for (const slug of slugs) { + const chaptersDir = path.join(sourceRoot, slug, "chapters-source"); + const files = listHtmlFiles(chaptersDir); + if (files.length === 0) { + missing.push(slug); + continue; + } + + scannedSlugs += 1; + totalFiles += files.length; + + for (const file of files) { + const text = fs.readFileSync(file, "utf-8"); + let match: RegExpExecArray | null; + while ((match = regex.exec(text))) { + const tag = match[1].toLowerCase(); + if (HTML_TAGS.has(tag) || ignoredTags.has(tag)) continue; + + tagCounts.set(tag, (tagCounts.get(tag) ?? 0) + 1); + + const samples = tagSamples.get(tag) ?? []; + if (samples.length < limit) { + const start = Math.max(0, match.index - 120); + const end = Math.min(text.length, match.index + 200); + samples.push({ file, snippet: normalizeSnippet(text.slice(start, end)) }); + tagSamples.set(tag, samples); + } + } + } + } + + console.log(`Scanned ${totalFiles} file(s) across ${scannedSlugs} slug(s).`); + if (missing.length > 0) { + console.log(`Skipped ${missing.length} slug(s) without chapters-source:`); + console.log(missing.join(", ")); + } + + if (tagCounts.size === 0) { + console.log("No non-HTML tags found."); + return; + } + + const sortedTags = Array.from(tagCounts.entries()).sort((a, b) => a[0].localeCompare(b[0])); + console.log(`Found ${sortedTags.length} non-HTML tag(s):`); + for (const [tag, count] of sortedTags) { + console.log(`- <${tag}> (${count} match(es))`); + const samples = tagSamples.get(tag) ?? 
[]; + for (const sample of samples) { + console.log(` ${sample.file}`); + console.log(` ${sample.snippet}`); + } + } +} + +if (require.main === module) { + main(); +} diff --git a/apps/pipeline/src/scripts/upload-chapters-source.spec.ts b/apps/pipeline/src/scripts/upload-chapters-source.spec.ts new file mode 100644 index 00000000..47e8dbe3 --- /dev/null +++ b/apps/pipeline/src/scripts/upload-chapters-source.spec.ts @@ -0,0 +1,14 @@ +import { expect, test } from "vitest"; +import { mapFilenameToBasename } from "./upload-chapters-source"; + +test("mapFilenameToBasename for the rewritten-xmls", () => { + const file = "rewritten-paragraphs-for-chapter-1.xml"; + const basename = mapFilenameToBasename(file); + expect(basename).toBe("chapter-1.html"); +}); + +test("mapFilenameToBasename for the chapter-N.html files", () => { + const file = "chapter-1.html"; + const basename = mapFilenameToBasename(file); + expect(basename).toBe("chapter-1.html"); +}); diff --git a/apps/pipeline/src/scripts/upload-chapters-source.ts b/apps/pipeline/src/scripts/upload-chapters-source.ts index 93d9440b..ec067ada 100644 --- a/apps/pipeline/src/scripts/upload-chapters-source.ts +++ b/apps/pipeline/src/scripts/upload-chapters-source.ts @@ -3,6 +3,9 @@ import path from "path"; import { convex } from "../server/convex-client"; import { AdminConvexHttpClient } from "../lib/AdminConvexHttpClient"; import { api } from "@bookgenius/convex/_generated/api"; +import { getChapterTitle } from "src/tools/new-tooling/get-chapter-title"; +import { DOMParser, type Element as XMLElement } from "@xmldom/xmldom"; +import { computeParagraphCount } from "../lib/paragraphCount"; type Args = { bookSlug: string; @@ -49,7 +52,20 @@ function detectContentType(filePath: string): string { } function listHtmlFiles(inputDir: string): string[] { - return fs.readdirSync(inputDir).filter((file) => file.toLowerCase().endsWith(".html")); + const htmlFiles = fs.readdirSync(inputDir).filter((file) => file.toLowerCase().endsWith(".html")); + if (htmlFiles.length >= 1) { + return htmlFiles; + } else { + const xmlFiles = fs + .readdirSync(inputDir) + .filter((f) => f.match(/^rewritten-paragraphs-for-chapter-\d+\.xml$/)); + if (xmlFiles.length >= 1) { + return xmlFiles; + } else { + console.error(`No .html or .xml files found in ${inputDir}`); + process.exit(1); + } + } } async function main() { @@ -74,7 +90,7 @@ async function main() { const files = stat.isDirectory() ? listHtmlFiles(inputPath).map((file) => ({ source: path.join(inputPath, file), - basename: file, + basename: mapFilenameToBasename(file), })) : [{ source: inputPath, basename: basename ?? 
path.basename(inputPath) }]; @@ -114,6 +130,7 @@ async function main() { } const content = fs.readFileSync(file.source); + const paragraphCount = computeParagraphCount(content.toString("utf-8")); try { await convex.uploadFile({ folderPath, @@ -121,6 +138,16 @@ async function main() { content, contentType: detectContentType(file.source), }); + const bookPath = `books/${bookSlug}`; + await convex.updateChapterMetadata({ + bookPath, + folderPath: `${bookPath}/chapters-source`, + basename: file.basename, + chapterNumber: parseInt(file.basename.split("-")[1], 10), + title: getChapterTitle(parseChapterIntoDom(content.toString("utf-8"))), + paragraphCount, + sourceFormat: "html", + }); console.log(`Uploaded ${file.basename}`); stats.uploaded += 1; } catch (error) { @@ -134,7 +161,30 @@ async function main() { ); } -main().catch((error) => { - console.error("Fatal error:", error); - process.exit(1); -}); +function parseChapterIntoDom(chapter: string): XMLElement { + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/html"); + const root = doc.documentElement as XMLElement; + return root; +} + +export function mapFilenameToBasename(filename: string): string { + const match = filename.match(/^rewritten-paragraphs-for-chapter-(\d+)\.xml$/); + if (match) { + return `chapter-${match[1]}.html`; + } else { + const match = filename.match(/^chapter-(\d+)\.html$/); + if (match) { + return `chapter-${match[1]}.html`; + } else { + throw new Error(`Invalid filename: ${filename}`); + } + } +} + +if (require.main === module) { + main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); + }); +} diff --git a/apps/pipeline/src/scripts/upload-fixed-nonplays.ts b/apps/pipeline/src/scripts/upload-fixed-nonplays.ts new file mode 100644 index 00000000..a4a1f74d --- /dev/null +++ b/apps/pipeline/src/scripts/upload-fixed-nonplays.ts @@ -0,0 +1,188 @@ +#!/usr/bin/env bun +import fs from "fs"; +import path from "path"; +import { convex } from "../server/convex-client"; +import { AdminConvexHttpClient } from "../lib/AdminConvexHttpClient"; +import { api } from "@bookgenius/convex/_generated/api"; +import { getChapterTitle } from "src/tools/new-tooling/get-chapter-title"; +import { DOMParser, type Element as XMLElement } from "@xmldom/xmldom"; +import { computeParagraphCount } from "../lib/paragraphCount"; +import { mapFilenameToBasename } from "./upload-chapters-source"; + +type Args = { sourceRoot: string; slugs: string[] | null; dryRun: boolean; allowNew: boolean }; + +const PLAY_SLUGS = new Set([ + "Hamlet", + "Macbeth", + "Midsummer-Nights-Dream", + "Othello", + "Romeo-And-Juliet", + "The-Tempest", + "Romeo-And-Juliet-Small", + "Romeo-And-Juliet-Smaller", +]); + +function resolvePath(inputPath: string): string { + if (inputPath.startsWith("~/")) { + return path.join(process.env.HOME ?? "", inputPath.slice(2)); + } + return path.resolve(inputPath); +} + +function parseArgs(): Args { + const args = process.argv.slice(2); + const sourceIdx = args.indexOf("--source"); + const slugsIdx = args.indexOf("--slugs"); + + const defaultSource = path.join(process.env.TMPDIR ?? "/tmp", "bookgenius-fixed-nonplays"); + + const slugs = + slugsIdx !== -1 ? (args[slugsIdx + 1]?.split(",").map((s) => s.trim()) ?? []) : null; + + return { + sourceRoot: resolvePath(sourceIdx !== -1 ? 
args[sourceIdx + 1] : defaultSource), + slugs, + dryRun: args.includes("--dry-run"), + allowNew: args.includes("--allow-new"), + }; +} + +function listSlugs(root: string): string[] { + if (!fs.existsSync(root)) return []; + return fs + .readdirSync(root, { withFileTypes: true }) + .filter((entry) => entry.isDirectory()) + .map((entry) => entry.name); +} + +function listHtmlFiles(inputDir: string): string[] { + const htmlFiles = fs.readdirSync(inputDir).filter((file) => file.toLowerCase().endsWith(".html")); + if (htmlFiles.length >= 1) { + return htmlFiles; + } + console.error(`No .html files found in ${inputDir}`); + process.exit(1); +} + +function detectContentType(filePath: string): string { + return filePath.toLowerCase().endsWith(".html") ? "text/html" : "application/octet-stream"; +} + +function parseChapterIntoDom(chapter: string): XMLElement { + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/html"); + const root = doc.documentElement as XMLElement; + return root; +} + +async function uploadSlug( + slug: string, + inputDir: string, + adminClient: AdminConvexHttpClient, + dryRun: boolean, + allowNew: boolean, +): Promise<{ uploaded: number; skipped: number; missing: number; total: number }> { + const folderPath = `books/${slug}/chapters-source`; + const stats = { uploaded: 0, skipped: 0, missing: 0, total: 0 }; + + const files = listHtmlFiles(inputDir).map((file) => ({ + source: path.join(inputDir, file), + basename: mapFilenameToBasename(file), + })); + + for (const file of files) { + stats.total += 1; + + if (!allowNew) { + const existing = await adminClient.query(api.cli.getAsset, { + folderPath, + basename: file.basename, + }); + if (!existing) { + console.error(`Missing asset in Convex: ${folderPath}/${file.basename}`); + stats.missing += 1; + continue; + } + } + + if (dryRun) { + console.log(`[dry-run] Would upload ${file.source} -> ${folderPath}/${file.basename}`); + stats.skipped += 1; + continue; + } + + const content = fs.readFileSync(file.source); + const paragraphCount = computeParagraphCount(content.toString("utf-8")); + try { + await convex.uploadFile({ + folderPath, + basename: file.basename, + content, + contentType: detectContentType(file.source), + }); + const bookPath = `books/${slug}`; + await convex.updateChapterMetadata({ + bookPath, + folderPath: `${bookPath}/chapters-source`, + basename: file.basename, + chapterNumber: parseInt(file.basename.split("-")[1], 10), + title: getChapterTitle(parseChapterIntoDom(content.toString("utf-8"))), + paragraphCount, + sourceFormat: "html", + }); + console.log(`Uploaded ${slug}/${file.basename}`); + stats.uploaded += 1; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.error(`Failed to upload ${slug}/${file.basename}: ${message}`); + } + } + + return stats; +} + +async function main() { + const { sourceRoot, slugs, dryRun, allowNew } = parseArgs(); + const convexUrl = process.env.CONVEX_URL || process.env.NEXT_PUBLIC_CONVEX_URL; + + if (!convexUrl) { + console.error("Missing CONVEX_URL environment variable"); + process.exit(1); + } + + if (!fs.existsSync(sourceRoot)) { + console.error(`Source root not found: ${sourceRoot}`); + process.exit(1); + } + + const adminClient = new AdminConvexHttpClient(convexUrl); + const targetSlugs = slugs ?? 
listSlugs(sourceRoot); + const totals = { uploaded: 0, skipped: 0, missing: 0, total: 0 }; + + for (const slug of targetSlugs) { + if (PLAY_SLUGS.has(slug)) continue; + + const inputDir = path.join(sourceRoot, slug, "chapters-source"); + if (!fs.existsSync(inputDir)) { + console.warn(`Skipping ${slug}: missing ${inputDir}`); + continue; + } + + const stats = await uploadSlug(slug, inputDir, adminClient, dryRun, allowNew); + totals.uploaded += stats.uploaded; + totals.skipped += stats.skipped; + totals.missing += stats.missing; + totals.total += stats.total; + } + + console.log( + `Done. Uploaded: ${totals.uploaded}, skipped: ${totals.skipped}, missing: ${totals.missing}, total: ${totals.total}`, + ); +} + +if (require.main === module) { + main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); + }); +} diff --git a/apps/pipeline/src/server/backfill-paragraph-counts.ts b/apps/pipeline/src/server/backfill-paragraph-counts.ts new file mode 100644 index 00000000..ffecba55 --- /dev/null +++ b/apps/pipeline/src/server/backfill-paragraph-counts.ts @@ -0,0 +1,168 @@ +#!/usr/bin/env bun +/** + * Backfill paragraphCount for chapters-source metadata using player indexing rules. + * + * Usage: + * bun apps/pipeline/src/server/backfill-paragraph-counts.ts [--dry-run] [--limit N] + * bun apps/pipeline/src/server/backfill-paragraph-counts.ts --all [--dry-run] [--limit N] + */ + +import "dotenv/config"; +import { AdminConvexHttpClient } from "../lib/AdminConvexHttpClient"; +import { api } from "@bookgenius/convex/_generated/api"; +import { computeParagraphCount } from "../lib/paragraphCount"; + +type Args = { bookSlug?: string; all: boolean; dryRun: boolean; limit?: number }; + +function parseArgs(): Args { + const args = process.argv.slice(2); + const all = args.includes("--all"); + const dryRun = args.includes("--dry-run"); + const limitIdx = args.indexOf("--limit"); + const limit = limitIdx !== -1 ? Number(args[limitIdx + 1]) : undefined; + let bookSlug: string | undefined; + for (let i = 0; i < args.length; i += 1) { + const arg = args[i]; + if (arg === "--limit") { + i += 1; + continue; + } + if (!arg.startsWith("--") && !bookSlug) { + bookSlug = arg; + } + } + + if (!all && !bookSlug) { + console.error( + "Usage: bun apps/pipeline/src/server/backfill-paragraph-counts.ts [--dry-run] [--limit N]", + ); + console.error( + " or: bun apps/pipeline/src/server/backfill-paragraph-counts.ts --all [--dry-run] [--limit N]", + ); + process.exit(1); + } + + return { bookSlug, all, dryRun, limit: Number.isFinite(limit) ? limit : undefined }; +} + +async function listBookPaths(client: AdminConvexHttpClient): Promise { + const books = await client.query(api.bookQueries.listBooks, {}); + return books.map((b) => b.path); +} + +async function backfillBook( + client: AdminConvexHttpClient, + bookPath: string, + options: { dryRun: boolean; limit?: number }, +): Promise<{ scanned: number; updated: number; skipped: number; failed: number }> { + const chapters = await client.query(api.bookQueries.listHtmlSourceChapters, { bookPath }); + + if (!chapters || chapters.length === 0) { + console.log(`[backfill] ${bookPath}: no chapters-source files found`); + return { scanned: 0, updated: 0, skipped: 0, failed: 0 }; + } + + let scanned = 0; + let updated = 0; + let skipped = 0; + let failed = 0; + + for (const chapter of chapters) { + scanned += 1; + const existingCount = chapter.paragraphCount ?? 
0; + if (existingCount > 0) { + skipped += 1; + continue; + } + + if (options.limit !== undefined && updated >= options.limit) { + break; + } + + const result = await client.action(api.cli.getTextContent, { versionId: chapter.versionId }); + + const html = result?.content ?? ""; + if (!html) { + console.warn(`[backfill] ${bookPath}/${chapter.basename}: empty content`); + failed += 1; + continue; + } + + const paragraphCount = computeParagraphCount(html); + + if (options.dryRun) { + console.log(`[dry-run] ${bookPath}/${chapter.basename} -> paragraphCount=${paragraphCount}`); + updated += 1; + continue; + } + + try { + await client.mutation(api.metadata.updateChapterMetadata, { + bookPath, + folderPath: `${bookPath}/chapters-source`, + basename: chapter.basename, + chapterNumber: chapter.chapterNumber, + paragraphCount, + sourceFormat: chapter.sourceFormat ?? "html", + }); + updated += 1; + console.log( + `[backfill] ${bookPath}/${chapter.basename} set paragraphCount=${paragraphCount}`, + ); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.warn(`[backfill] ${bookPath}/${chapter.basename} failed to update: ${message}`); + failed += 1; + } + } + + return { scanned, updated, skipped, failed }; +} + +async function main() { + const { bookSlug, all, dryRun, limit } = parseArgs(); + const convexUrl = process.env.CONVEX_URL || process.env.NEXT_PUBLIC_CONVEX_URL; + + if (!convexUrl) { + console.error("Missing CONVEX_URL environment variable"); + process.exit(1); + } + + const client = new AdminConvexHttpClient(convexUrl); + const bookPaths = all + ? await listBookPaths(client) + : [`books/${bookSlug?.replace(/^books\//, "")}`]; + + let totalScanned = 0; + let totalUpdated = 0; + let totalSkipped = 0; + let totalFailed = 0; + + let remainingLimit = limit; + + for (const bookPath of bookPaths) { + console.log(`\n[backfill] Processing ${bookPath}...`); + const result = await backfillBook(client, bookPath, { dryRun, limit: remainingLimit }); + totalScanned += result.scanned; + totalUpdated += result.updated; + totalSkipped += result.skipped; + totalFailed += result.failed; + + if (remainingLimit !== undefined) { + remainingLimit -= result.updated; + if (remainingLimit <= 0) { + break; + } + } + } + + console.log( + `\n[backfill] Done. scanned=${totalScanned} updated=${totalUpdated} skipped=${totalSkipped} failed=${totalFailed}`, + ); +} + +main().catch((error) => { + const message = error instanceof Error ? 
error.message : String(error); + console.error(`[backfill] Fatal error: ${message}`); + process.exit(1); +}); diff --git a/apps/pipeline/src/server/clone-book.ts b/apps/pipeline/src/server/clone-book.ts index 34a1e3a1..37e76d0b 100644 --- a/apps/pipeline/src/server/clone-book.ts +++ b/apps/pipeline/src/server/clone-book.ts @@ -18,6 +18,7 @@ import { convex, getCharacterFolders, getChapterXml } from "./convex-client"; import { uploadBookFolder } from "./upload-books-to-r2"; import { AdminConvexHttpClient } from "../lib/AdminConvexHttpClient"; import { api } from "@bookgenius/convex/_generated/api"; +import { computeParagraphCount } from "../lib/paragraphCount"; import "dotenv/config"; import { v4 as uuidv4 } from "uuid"; @@ -182,6 +183,7 @@ async function cloneBook(sourceSlug: string, targetSlug: string): Promise value.status === "done") + .map(([step]) => step as Step); + } fromStep = getNextStep(slug) || undefined; if (fromStep) { console.log(`Auto-detected next step: ${fromStep} (${StepLabels[fromStep]})`); @@ -158,7 +165,7 @@ async function main() { console.log(`\nContinuing pipeline for slug="${slug}" from step: ${fromStep}`); - const job = await startPipeline({ slug, fromStep }); + const job = await startPipeline({ slug, fromStep, completedSteps }); // Poll job state and stream logs let lastLogIndex = 0; diff --git a/apps/pipeline/src/server/convex-client.ts b/apps/pipeline/src/server/convex-client.ts index af9ebb51..b5e4e6a7 100644 --- a/apps/pipeline/src/server/convex-client.ts +++ b/apps/pipeline/src/server/convex-client.ts @@ -67,6 +67,7 @@ export const convex = { basename: string; chapterNumber: number; title?: string; + paragraphCount?: number; sourceFormat?: string; }) { return await client.mutation(api.metadata.updateChapterMetadata, args); diff --git a/apps/pipeline/src/server/fix-chapters-upload.ts b/apps/pipeline/src/server/fix-chapters-upload.ts index 612a9310..61c6bcdf 100644 --- a/apps/pipeline/src/server/fix-chapters-upload.ts +++ b/apps/pipeline/src/server/fix-chapters-upload.ts @@ -12,6 +12,7 @@ import path from "path"; import fs from "fs-extra"; import { convex } from "./convex-client"; import "dotenv/config"; +import { computeParagraphCount } from "../lib/paragraphCount"; async function fixChaptersUpload(bookSlug: string): Promise { const repoRoot = path.resolve(__dirname, "../../"); @@ -45,6 +46,7 @@ async function fixChaptersUpload(bookSlug: string): Promise { const chapterNumber = parseInt(match[1], 10); const filePath = path.join(tempOutput, file); const content = await fs.readFile(filePath, "utf-8"); + const paragraphCount = computeParagraphCount(content); const basename = `chapter-${chapterNumber}.html`; console.log(` Uploading chapter ${chapterNumber}...`); @@ -62,6 +64,7 @@ async function fixChaptersUpload(bookSlug: string): Promise { folderPath: `${bookPath}/chapters-source`, basename, chapterNumber, + paragraphCount, sourceFormat: "html", }); diff --git a/apps/pipeline/src/server/notes-import.test.ts b/apps/pipeline/src/server/notes-import.test.ts new file mode 100644 index 00000000..b8d53b32 --- /dev/null +++ b/apps/pipeline/src/server/notes-import.test.ts @@ -0,0 +1,114 @@ +import { describe, expect, it } from "vitest"; + +import { + buildNotesToUploadFromNoteMap, + collectReferencedNoteIdsByChapter, + normalizeNoteRefId, +} from "./notes-import"; + +describe("pipeline notes import helpers", () => { + describe("normalizeNoteRefId", () => { + it("normalizes numeric and note-/fn- patterns to fnN", () => { + expect(normalizeNoteRefId("1")).toBe("fn1"); + 
expect(normalizeNoteRefId("note-42")).toBe("fn42"); + expect(normalizeNoteRefId("fn9")).toBe("fn9"); + }); + + it("returns null for unsupported IDs", () => { + expect(normalizeNoteRefId("appendix-1")).toBeNull(); + expect(normalizeNoteRefId("")).toBeNull(); + }); + }); + + describe("collectReferencedNoteIdsByChapter", () => { + it("extracts note refs from both and ", () => { + const richXml = ` +
+ +
+

A 3

+
+
+

B

+
+ +
`; + + const refs = collectReferencedNoteIdsByChapter(richXml); + + expect(refs).toEqual([ + { noteId: "fn3", chapter: 1 }, + { noteId: "fn4", chapter: 2 }, + ]); + }); + + it("handles data-note attribute regardless of attribute order", () => { + const richXml = ` +
+ +
+

15

+
+ +
`; + + const refs = collectReferencedNoteIdsByChapter(richXml); + + expect(refs).toEqual([{ noteId: "fn15", chapter: 7 }]); + }); + + it("deduplicates repeated references and keeps first chapter", () => { + const richXml = ` +
+ +
+

8

+
+
+

8

+
+ +
`; + + const refs = collectReferencedNoteIdsByChapter(richXml); + + expect(refs).toEqual([{ noteId: "fn8", chapter: 2 }]); + }); + }); + + describe("buildNotesToUploadFromNoteMap", () => { + it("builds upload payload by matching rich.xml references with available notes", () => { + const richXml = ` +
+ +

1

+

2

+ +
`; + + const noteMap = new Map([ + ["fn1", "

First

"], + ["fn2", "

Second

"], + ["fn99", "

Orphan

"], + ]); + + const uploads = buildNotesToUploadFromNoteMap({ bookPath: "books/test", richXml, noteMap }); + + expect(uploads).toEqual([ + { bookPath: "books/test", noteId: "fn1", content: "

First

", chapter: 1 }, + { bookPath: "books/test", noteId: "fn2", content: "

Second

", chapter: 2 }, + ]); + }); + + it("returns empty when no references are present", () => { + const uploads = buildNotesToUploadFromNoteMap({ + bookPath: "books/test", + richXml: + '

No refs

', + noteMap: new Map([["fn1", "

First

"]]), + }); + + expect(uploads).toEqual([]); + }); + }); +}); diff --git a/apps/pipeline/src/server/notes-import.ts b/apps/pipeline/src/server/notes-import.ts new file mode 100644 index 00000000..78765577 --- /dev/null +++ b/apps/pipeline/src/server/notes-import.ts @@ -0,0 +1,71 @@ +import { JSDOM } from "jsdom"; + +export interface ReferencedNote { + noteId: string; + chapter: number; +} + +export function normalizeNoteRefId(rawId: string): string | null { + const value = rawId.trim(); + if (!value) return null; + + const fnMatch = value.match(/^fn(\d+)$/i); + if (fnMatch) return `fn${fnMatch[1]}`; + + const noteMatch = value.match(/^note-(\d+)$/i); + if (noteMatch) return `fn${noteMatch[1]}`; + + const numberMatch = value.match(/^(\d+)$/); + if (numberMatch) return `fn${numberMatch[1]}`; + + return null; +} + +export function collectReferencedNoteIdsByChapter(richXml: string): ReferencedNote[] { + const refs: ReferencedNote[] = []; + const seen = new Set(); + + const dom = new JSDOM(richXml, { contentType: "application/xml" }); + const doc = dom.window.document; + + const sections = doc.querySelectorAll("section[data-chapter]"); + for (const section of Array.from(sections)) { + const chapterValue = section.getAttribute("data-chapter"); + const chapter = chapterValue ? parseInt(chapterValue, 10) : NaN; + if (Number.isNaN(chapter)) continue; + + const noteRefs = section.querySelectorAll("a[data-note], note[id]"); + for (const noteRef of Array.from(noteRefs)) { + const isAnchor = noteRef.tagName.toLowerCase() === "a"; + const rawId = isAnchor + ? noteRef.getAttribute("data-note") || "" + : noteRef.getAttribute("id") || ""; + const noteId = normalizeNoteRefId(rawId); + if (!noteId || seen.has(noteId)) continue; + + refs.push({ noteId, chapter }); + seen.add(noteId); + } + } + + return refs; +} + +export function buildNotesToUploadFromNoteMap(args: { + bookPath: string; + richXml: string; + noteMap: Map; +}): { bookPath: string; noteId: string; content: string; chapter: number }[] { + const references = collectReferencedNoteIdsByChapter(args.richXml); + + return references + .map((ref) => { + const content = args.noteMap.get(ref.noteId); + if (!content) return null; + return { bookPath: args.bookPath, noteId: ref.noteId, content, chapter: ref.chapter }; + }) + .filter( + (note): note is { bookPath: string; noteId: string; content: string; chapter: number } => + Boolean(note), + ); +} diff --git a/apps/pipeline/src/server/parallel-scheduler.ts b/apps/pipeline/src/server/parallel-scheduler.ts index dc73177e..068a0081 100644 --- a/apps/pipeline/src/server/parallel-scheduler.ts +++ b/apps/pipeline/src/server/parallel-scheduler.ts @@ -5,6 +5,7 @@ export type StepDependency = { step: Step; deps: Step[] }; export const STEP_DEPENDENCIES: StepDependency[] = [ { step: "import_epub", deps: [] }, { step: "create_settings", deps: ["import_epub"] }, + { step: "upload_figures", deps: ["create_settings"] }, { step: "generate_reference_cards", deps: ["create_settings"] }, { step: "rewrite_paragraphs", deps: ["generate_reference_cards"] }, { step: "generate_graphical_style", deps: ["create_settings"] }, diff --git a/apps/pipeline/src/server/pipeline-progress.ts b/apps/pipeline/src/server/pipeline-progress.ts index 3c05aa56..4e54bed0 100644 --- a/apps/pipeline/src/server/pipeline-progress.ts +++ b/apps/pipeline/src/server/pipeline-progress.ts @@ -22,6 +22,7 @@ export interface PipelineProgress { const STEP_ORDER: Step[] = [ "import_epub", "create_settings", + "upload_figures", "generate_reference_cards", 
"rewrite_paragraphs", "generate_graphical_style", diff --git a/apps/pipeline/src/server/pipeline.ts b/apps/pipeline/src/server/pipeline.ts index dfaee5dd..3cd4e67d 100644 --- a/apps/pipeline/src/server/pipeline.ts +++ b/apps/pipeline/src/server/pipeline.ts @@ -32,6 +32,7 @@ import { } from "../../src/tools/new-tooling/create-graphical-style"; import { getBookSettings } from "../../src/helpers/getBookSettings"; import { generateTagName } from "../../src/helpers/generateTagName"; +import { computeParagraphCount } from "../lib/paragraphCount"; import { initProgress, markStepStarted, @@ -51,6 +52,7 @@ import { setStyleChoice, } from "./style-selection"; import { STEP_DEPENDENCIES, getReadySteps, createSchedulerState } from "./parallel-scheduler"; +import { buildNotesToUploadFromNoteMap, normalizeNoteRefId } from "./notes-import"; export type StyleSelectionCallback = { onUserStyleSubmitted?: (userStyle: GraphicalStyle | null) => void; @@ -131,6 +133,8 @@ function getContentType(filename: string): string { ".png": "image/png", ".jpg": "image/jpeg", ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".svg": "image/svg+xml", ".webp": "image/webp", ".mp4": "video/mp4", ".webm": "video/webm", @@ -204,6 +208,7 @@ async function uploadChaptersToConvex(job: Job, tempOutputDir: string) { const filePath = path.join(tempOutputDir, file); const content = fs.readFileSync(filePath); const basename = `chapter-${chapterNumber}.html`; + const paragraphCount = computeParagraphCount(content.toString("utf-8")); addLog(job, `Uploading chapter ${chapterNumber} to Convex...`); @@ -212,7 +217,7 @@ async function uploadChaptersToConvex(job: Job, tempOutputDir: string) { folderPath: `${job.bookPath}/chapters-source`, basename, content, - contentType: "application/html", + contentType: "text/html", }); await convex.updateChapterMetadata({ bookPath: job.bookPath, @@ -220,6 +225,7 @@ async function uploadChaptersToConvex(job: Job, tempOutputDir: string) { basename, chapterNumber, title: `Chapter ${chapterNumber}`, + paragraphCount, sourceFormat: "html", }); addLog(job, `✔ Chapter ${chapterNumber} uploaded`); @@ -255,8 +261,16 @@ async function uploadCharactersToConvex( } } + console.log( + "[uploadCharactersToConvex] Characters to process:", + referenceCards.characters.map((c) => c.name), + ); + for (const character of referenceCards.characters) { const characterSlug = generateTagName(character.name).toLowerCase(); + console.log( + `[uploadCharactersToConvex] Processing character: "${character.name}" -> slug: "${characterSlug}"`, + ); const promptEntry = generatedPrompts.characters.find( (p) => generateTagName(p.name).toLowerCase() === characterSlug, ); @@ -273,7 +287,11 @@ async function uploadCharactersToConvex( const avatarExtensions = [".png", ".jpg", ".jpeg", ".webp"]; for (const ext of avatarExtensions) { const avatarPath = path.join(outputDir, "characters", `${characterSlug}${ext}`); - if (fs.existsSync(avatarPath)) { + const fileExists = fs.existsSync(avatarPath); + console.log( + `[uploadCharactersToConvex] Checking avatar: ${avatarPath} - exists: ${fileExists}`, + ); + if (fileExists) { addLog(job, `Uploading avatar for ${character.name}...`); try { const content = fs.readFileSync(avatarPath); @@ -342,81 +360,70 @@ async function uploadBackgroundsToConvex(job: Job, outputDir: string) { // eslint-disable-next-line complexity async function extractAndUploadNotesToConvex(job: Job, inputDir: string) { - const fb2Path = findFb2FilePath(inputDir); - if (!fb2Path) { - addLog(job, `⚠ No FB2 file found, skipping notes 
extraction`); - return; - } - const richXmlPath = path.join(inputDir, "rich.xml"); if (!fs.existsSync(richXmlPath)) { addLog(job, `⚠ No rich.xml found, skipping notes extraction`); return; } - const fb2Content = fs.readFileSync(fb2Path, "utf-8"); - const fb2Doc = parseFb2Xml(fb2Content); - const notesBody = fb2Doc.querySelector("body[name='notes']"); - - if (!notesBody) { - addLog(job, `No notes section found in FB2`); - return; - } - - const sections = notesBody.querySelectorAll("section"); const noteMap = new Map(); + const fb2Path = findFb2FilePath(inputDir); + if (fb2Path) { + const fb2Content = fs.readFileSync(fb2Path, "utf-8"); + const fb2Doc = parseFb2Xml(fb2Content); + const notesBody = fb2Doc.querySelector("body[name='notes']"); + + if (!notesBody) { + addLog(job, `No notes section found in FB2`); + } else { + const sections = notesBody.querySelectorAll("section"); + for (const section of Array.from(sections)) { + const id = section.getAttribute("id"); + const normalizedId = id ? normalizeNoteRefId(id) : null; + const content = + section + .querySelector("p") + ?.innerHTML?.replace(' xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"', "") || ""; + if (normalizedId && content) { + noteMap.set(normalizedId, content); + } + } + } + } else { + addLog(job, `⚠ No FB2 file found, trying Standard Ebooks notes fallback`); + } - for (const section of Array.from(sections)) { - const id = section.getAttribute("id"); - const content = - section - .querySelector("p") - ?.innerHTML?.replace(' xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"', "") || ""; - if (id && content) { - noteMap.set(id, content); + if (noteMap.size === 0) { + const seNotesPath = path.join(inputDir, "se-notes.json"); + if (fs.existsSync(seNotesPath)) { + try { + const seNotes = JSON.parse(fs.readFileSync(seNotesPath, "utf-8")) as { + noteId: string; + content: string; + }[]; + for (const note of seNotes) { + const normalizedId = normalizeNoteRefId(note.noteId); + if (normalizedId && note.content) { + noteMap.set(normalizedId, note.content); + } + } + if (noteMap.size > 0) { + addLog(job, `Found ${noteMap.size} notes in se-notes.json`); + } + } catch (e) { + const msg = e instanceof Error ? e.message : String(e); + addLog(job, `⚠ Failed to parse se-notes.json: ${msg}`); + } } } if (noteMap.size === 0) { - addLog(job, `No notes found in FB2`); + addLog(job, `No notes found in available note sources`); return; } - addLog(job, `Found ${noteMap.size} notes in FB2`); - const richXml = fs.readFileSync(richXmlPath, "utf-8"); - const notesToUpload: { bookPath: string; noteId: string; content: string; chapter: number }[] = - []; - const usedNoteIds = new Set(); - - // Regex: match
and capture chapter content until next section or end - const chapterRegex = - /]*data-chapter="(\d+)"[^>]*>([\s\S]*?)(?=]*data-chapter="|$)/g; - // Match both and formats - const noteRefRegex = /(?: n.noteId)); const orphaned = noteMap.size - usedNoteIds.size; if (orphaned > 0) { addLog(job, `⚠ ${orphaned} notes not referenced in any chapter`); @@ -456,14 +464,62 @@ async function uploadGraphicalStyleToConvex(job: Job, tempOutputDir: string) { } } +async function uploadFiguresToConvex(job: Job, repoRoot: string) { + const seBookDir = path.join(repoRoot, "standardebooks-data", "books", job.slug); + const metadataPath = path.join(seBookDir, "metadata.json"); + const imagesDir = path.join(seBookDir, "images"); + + if (!fs.existsSync(metadataPath) || !fs.existsSync(imagesDir)) { + addLog(job, "No Standard Ebooks images detected - skipping figure upload"); + return; + } + + const files = fs.readdirSync(imagesDir).filter((f) => /\.(png|jpe?g|gif|svg|webp)$/i.test(f)); + + if (files.length === 0) { + addLog(job, `No figure images found in ${imagesDir}`); + return; + } + + addLog(job, `Uploading ${files.length} Standard Ebooks figures to Convex...`); + + let uploaded = 0; + for (const file of files) { + const filePath = path.join(imagesDir, file); + try { + const content = fs.readFileSync(filePath); + await convex.uploadFile({ + folderPath: `${job.bookPath}/figures`, + basename: file, + content, + contentType: getContentType(file), + }); + uploaded++; + } catch (e) { + const msg = e instanceof Error ? e.message : String(e); + addLog(job, `⚠ Failed to upload figure ${file}: ${msg}`); + } + } + + addLog(job, `✔ Figures uploaded: ${uploaded}/${files.length}`); +} + export async function startPipeline(input: { epubPath?: string; fb2Path?: string; slug?: string; ebookConvertBin?: string; fromStep?: Step; + completedSteps?: Step[]; }) { - const { epubPath, fb2Path, slug: providedSlug, ebookConvertBin, fromStep } = input; + const { + epubPath, + fb2Path, + slug: providedSlug, + ebookConvertBin, + fromStep, + completedSteps, + } = input; const baseName = epubPath ? path.basename(epubPath, path.extname(epubPath)) : null; const slug = providedSlug || slugify(baseName || "book"); const bookPath = `books/${slug}`; @@ -472,6 +528,7 @@ export async function startPipeline(input: { const stepOrder = getStepOrder(); const fromStepIndex = fromStep ? getStepIndex(fromStep) : -1; + const completedStepSet = new Set(completedSteps ?? 
[]); const job: Job = { id: uuidv4(), @@ -483,6 +540,9 @@ export async function startPipeline(input: { logs: [], steps: stepOrder.map((step) => { const stepIndex = getStepIndex(step); + if (completedStepSet.has(step) && step !== fromStep) { + return { step, status: "done" as const }; + } if (fromStepIndex > 0 && stepIndex < fromStepIndex) { return { step, status: "done" as const }; } @@ -502,6 +562,11 @@ export async function startPipeline(input: { const tempOutputDir = path.join(bookRoot, "temporary-output"); const schedulerState = createSchedulerState(); + for (const step of job.steps) { + if (step.status === "done") { + schedulerState.completedSteps.add(step.step); + } + } let referenceCards: NewReferenceCardsResponse; initStyleSelection(bookRoot); @@ -564,6 +629,10 @@ export async function startPipeline(input: { }); }, + upload_figures: async () => { + await uploadFiguresToConvex(job, repoRoot); + }, + generate_reference_cards: async () => { setBookArg(slug); const fileName = "single-summary-per-person.json"; @@ -648,6 +717,28 @@ export async function startPipeline(input: { } else { autoStyle = await createGraphicalStyle(slug, { saveToFile: false }); } + + const isFreeRun = process.env.FREE_RUN === "true"; + if (isFreeRun) { + const FREE_RUN_AVATAR_STYLE = + "Abstract geometric avatar Bauhaus style, simple shapes, limited color palette. Natural look, flat shade."; + const forcedStyle = { ...autoStyle, avatarStyle: FREE_RUN_AVATAR_STYLE }; + + setAutoStyleComplete(bookRoot, forcedStyle); + setStyleChoice(bookRoot, "auto"); + + writeBookFile( + "graphicalStyle.json", + JSON.stringify(forcedStyle, null, 2), + FILE_TYPE.TEMPORARY, + ); + addLog(job, "FREE_RUN enabled - skipping style selection and previews"); + + styleSelectionCallbacks.delete(job.id); + await uploadGraphicalStyleToConvex(job, tempOutputDir); + return; + } + setAutoStyleComplete(bookRoot, autoStyle); addLog(job, "Auto style generated, awaiting user input"); @@ -736,6 +827,10 @@ export async function startPipeline(input: { }, generate_backgrounds: async () => { + if (process.env.FREE_RUN === "true") { + addLog(job, "FREE_RUN enabled - skipping background generation and upload"); + return; + } setBookArg(slug); await generateBackgrounds({}); await uploadBackgroundsToConvex(job, outputDir); @@ -827,7 +922,7 @@ export async function startPipeline(input: { (step) => step !== "complete" && step !== "failed", ); - if (process.env.QUICK_MODE === "true") { + if (process.env.QUICK_MODE === "true" || process.env.FREE_RUN === "true") { const skipSteps: Step[] = [ "make_chapter_summaries", "map_summaries_to_paragraphs", @@ -835,8 +930,6 @@ export async function startPipeline(input: { "upload_answer_server_data", ]; for (const skip of skipSteps) { - const idx = stepsToRun.indexOf(skip); - if (idx !== -1) stepsToRun.splice(idx, 1); schedulerState.completedSteps.add(skip); const s = job.steps.find((x) => x.step === skip); if (s) s.status = "done"; diff --git a/apps/pipeline/src/server/regenerate-missing-avatars.ts b/apps/pipeline/src/server/regenerate-missing-avatars.ts index 799b2caa..11207add 100644 --- a/apps/pipeline/src/server/regenerate-missing-avatars.ts +++ b/apps/pipeline/src/server/regenerate-missing-avatars.ts @@ -161,6 +161,60 @@ async function generateSingleAvatar( } } +/** + * Generate a generic avatar for unknown/minor characters. + * This avatar matches the book's art style but shows a mysterious silhouette + * that can be used for any speaker not in the character list. 
+ */ +async function generateGenericAvatar(bookPath: string, avatarStyle: string): Promise { + const genericPath = `${bookPath}/characters/generic`; + + // Check if generic avatar already exists + try { + const files = await getPublishedFilesInFolder(genericPath); + const hasAvatar = files.some((f) => f.basename === "avatar-large.png"); + if (hasAvatar) { + console.log("✅ Generic avatar already exists, skipping"); + return; + } + } catch { + // Folder doesn't exist yet, that's fine + } + + console.log("📷 Generating generic avatar for unknown characters..."); + + const genericPrompt = `A mysterious figure shown from behind or in silhouette. +No distinct facial features visible. The figure should feel enigmatic and anonymous, +suitable for representing any unnamed or minor character. +Atmospheric lighting with the figure partially obscured by shadow or mist.`; + + try { + const generator = + process.env.FREE_RUN === "true" + ? generateCharacterImageWithFlux + : generateCharacterImageWithOpenAI; + const imageBuffer = await generator(genericPrompt, "Unknown Character", avatarStyle); + + if (!imageBuffer) { + console.error("❌ Failed to generate generic avatar"); + return; + } + + console.log("📤 Uploading generic avatar..."); + await convex.uploadFile({ + folderPath: genericPath, + basename: "avatar-large.png", + content: imageBuffer, + contentType: "image/png", + }); + + console.log("✅ Successfully generated and uploaded generic avatar"); + } catch (e) { + const errorMsg = e instanceof Error ? e.message : String(e); + console.error("❌ Error generating generic avatar:", errorMsg); + } +} + async function regenerateMissingAvatars(bookPath: string, avatarStyle: string): Promise { const { missingLarge } = await findCharactersMissingAvatars(bookPath); @@ -246,6 +300,9 @@ async function main() { } await regenerateMissingAvatars(bookPath, styleData.avatarStyle); + + // Generate a generic avatar for unknown/minor characters + await generateGenericAvatar(bookPath, styleData.avatarStyle); } main().catch((e) => { diff --git a/apps/pipeline/src/server/router.ts b/apps/pipeline/src/server/router.ts index cbbc036b..2d75b0e8 100644 --- a/apps/pipeline/src/server/router.ts +++ b/apps/pipeline/src/server/router.ts @@ -603,6 +603,10 @@ export const appRouter = router({ submitStyleDescription: procedure .input(z.object({ jobId: z.string(), description: z.string().nullable() })) .mutation(async ({ input }) => { + if (process.env.FREE_RUN === "true") { + throw new Error("Style selection disabled when FREE_RUN=true"); + } + const job = jobs.get(input.jobId); if (!job) throw new Error("Job not found"); @@ -647,6 +651,10 @@ export const appRouter = router({ chooseStyle: procedure .input(z.object({ jobId: z.string(), choice: z.enum(["auto", "user"]) })) .mutation(async ({ input }) => { + if (process.env.FREE_RUN === "true") { + throw new Error("Style selection disabled when FREE_RUN=true"); + } + const job = jobs.get(input.jobId); if (!job) throw new Error("Job not found"); @@ -677,6 +685,10 @@ export const appRouter = router({ .input(z.object({ jobId: z.string() })) // eslint-disable-next-line complexity -- multiple file operations and style processing steps .mutation(async ({ input }) => { + if (process.env.FREE_RUN === "true") { + throw new Error("Style previews disabled when FREE_RUN=true"); + } + const job = jobs.get(input.jobId); if (!job) throw new Error("Job not found"); diff --git a/apps/pipeline/src/server/style-selection.ts b/apps/pipeline/src/server/style-selection.ts index 3bfdf9a8..0165fcdb 100644 --- 
a/apps/pipeline/src/server/style-selection.ts +++ b/apps/pipeline/src/server/style-selection.ts @@ -49,13 +49,14 @@ function getStyleSelectionPath(bookRoot: string): string { export function initStyleSelection(bookRoot: string): StyleSelectionState { const now = Date.now(); + const isFreeRun = process.env.FREE_RUN === "true"; const state: StyleSelectionState = { - status: "generating_auto_style", + status: isFreeRun ? "complete" : "generating_auto_style", autoStyle: null, userPrompt: null, userStyle: null, previews: null, - selected: null, + selected: isFreeRun ? "auto" : null, timeoutAt: null, startedAt: now, updatedAt: now, diff --git a/apps/pipeline/src/server/upload-chapters.ts b/apps/pipeline/src/server/upload-chapters.ts deleted file mode 100644 index 7e02bd81..00000000 --- a/apps/pipeline/src/server/upload-chapters.ts +++ /dev/null @@ -1,69 +0,0 @@ -import fs from "fs"; -import path from "path"; -import { convex } from "./convex-client"; - -async function uploadChapters(bookSlug: string) { - const repoRoot = path.resolve(__dirname, "../../"); - const bookRoot = path.join(repoRoot, "books-data", bookSlug); - const tempOutputDir = path.join(bookRoot, "temporary-output"); - const bookPath = `books/${bookSlug}`; - - if (!fs.existsSync(tempOutputDir)) { - console.error(`Directory not found: ${tempOutputDir}`); - process.exit(1); - } - - const files = fs - .readdirSync(tempOutputDir) - .filter((f) => f.match(/^rewritten-paragraphs-for-chapter-\d+\.xml$/)); - - if (files.length === 0) { - console.log("No chapter files found to upload"); - return; - } - - console.log(`Found ${files.length} chapters to upload`); - - for (const file of files) { - const match = file.match(/chapter-(\d+)/); - if (!match) continue; - - const chapterNumber = parseInt(match[1], 10); - const filePath = path.join(tempOutputDir, file); - const content = fs.readFileSync(filePath); - const basename = `chapter-${chapterNumber}.html`; - - console.log(`Uploading chapter ${chapterNumber}...`); - - try { - await convex.uploadFile({ - folderPath: `${bookPath}/chapters-source`, - basename, - content, - contentType: "text/html", - }); - await convex.updateChapterMetadata({ - bookPath, - folderPath: `${bookPath}/chapters-source`, - basename, - chapterNumber, - title: `Chapter ${chapterNumber}`, - sourceFormat: "html", - }); - console.log(`✔ Chapter ${chapterNumber} uploaded`); - } catch (e) { - const msg = e instanceof Error ? 
e.message : String(e); - console.error(`✖ Failed to upload chapter ${chapterNumber}: ${msg}`); - } - } - - console.log("Done!"); -} - -const slug = process.argv[2]; -if (!slug) { - console.error("Usage: tsx upload-chapters.ts "); - process.exit(1); -} - -uploadChapters(slug); diff --git a/apps/pipeline/src/server/upload-figures-cli.ts b/apps/pipeline/src/server/upload-figures-cli.ts new file mode 100644 index 00000000..dabc479b --- /dev/null +++ b/apps/pipeline/src/server/upload-figures-cli.ts @@ -0,0 +1,107 @@ +#!/usr/bin/env tsx +import path from "path"; +import fs from "fs"; +import dotenv from "dotenv"; +import { convex } from "./convex-client"; + +dotenv.config(); + +function getRepoRoot(): string { + return path.resolve(__dirname, "../../"); +} + +function getContentType(filename: string): string { + const ext = path.extname(filename).toLowerCase(); + const types: Record = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".svg": "image/svg+xml", + ".webp": "image/webp", + }; + return types[ext] || "application/octet-stream"; +} + +function usage() { + console.log("Usage: bun src/server/upload-figures-cli.ts [--source ]"); + console.log("Examples:"); + console.log( + " bun src/server/upload-figures-cli.ts agatha-christie_the-mysterious-affair-at-styles", + ); + console.log( + " bun src/server/upload-figures-cli.ts agatha-christie_the-mysterious-affair-at-styles-openai --source agatha-christie_the-mysterious-affair-at-styles", + ); +} + +async function main() { + const args = process.argv.slice(2); + const targetSlug = args[0]; + + if (!targetSlug) { + usage(); + process.exit(1); + } + + const sourceFlagIndex = args.findIndex((arg) => arg === "--source" || arg === "--from"); + const sourceSlug = + sourceFlagIndex !== -1 && args[sourceFlagIndex + 1] ? args[sourceFlagIndex + 1] : targetSlug; + + const repoRoot = getRepoRoot(); + const seBookDir = path.join(repoRoot, "standardebooks-data", "books", sourceSlug); + const metadataPath = path.join(seBookDir, "metadata.json"); + const imagesDir = path.join(seBookDir, "images"); + + if (!fs.existsSync(metadataPath)) { + console.error(`Standard Ebooks metadata not found: ${metadataPath}`); + process.exit(1); + } + + if (!fs.existsSync(imagesDir)) { + console.error(`Images directory not found: ${imagesDir}`); + process.exit(1); + } + + const files = fs.readdirSync(imagesDir).filter((f) => /\.(png|jpe?g|gif|svg|webp)$/i.test(f)); + + if (files.length === 0) { + console.log(`No figure images found in ${imagesDir}`); + process.exit(0); + } + + console.log( + `Uploading ${files.length} figures from "${sourceSlug}" to books/${targetSlug}/figures...`, + ); + + let uploaded = 0; + let failed = 0; + + for (const file of files) { + const filePath = path.join(imagesDir, file); + try { + const content = fs.readFileSync(filePath); + await convex.uploadFile({ + folderPath: `books/${targetSlug}/figures`, + basename: file, + content, + contentType: getContentType(file), + }); + uploaded++; + console.log(`✔ ${file}`); + } catch (e) { + failed++; + const msg = e instanceof Error ? e.message : String(e); + console.error(`✖ ${file}: ${msg}`); + } + } + + console.log(`Done. 
Uploaded ${uploaded}/${files.length} figures.`); + if (failed > 0) { + process.exit(1); + } +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/apps/pipeline/src/shared/pipelineTypes.ts b/apps/pipeline/src/shared/pipelineTypes.ts index affc1417..e7adfd0a 100644 --- a/apps/pipeline/src/shared/pipelineTypes.ts +++ b/apps/pipeline/src/shared/pipelineTypes.ts @@ -40,6 +40,7 @@ export type StyleSelectionState = z.infer; export const StepEnum = z.enum([ "import_epub", "create_settings", + "upload_figures", "generate_reference_cards", "rewrite_paragraphs", "generate_graphical_style", @@ -59,6 +60,7 @@ export type Step = z.infer; export const StepLabels: Record = { import_epub: "Import EPUB", create_settings: "Create Settings", + upload_figures: "Upload Figures", generate_reference_cards: "Generate Reference Cards", rewrite_paragraphs: "Rewrite Paragraphs", generate_graphical_style: "Generate Graphical Style", diff --git a/apps/pipeline/src/tools/NewRewriteParagraphsPromptBook.md b/apps/pipeline/src/tools/NewRewriteParagraphsPromptBook.md index 25e4842c..d9e549e9 100644 --- a/apps/pipeline/src/tools/NewRewriteParagraphsPromptBook.md +++ b/apps/pipeline/src/tools/NewRewriteParagraphsPromptBook.md @@ -34,6 +34,24 @@ Identify mentions of the characters within the text. - **Flexibility:** Match names even if they appear in different grammatical cases (e.g., Polish declensions like "Winstona", "Winstonowi") or possessives (English "Winston's") or when referenced by title ("General") - but only if its a clear reference to the character. - **Structure:** `Mentioned Name` +## 3. Unknown Character Speakers + +When dialogue is spoken by a character **NOT in the Characters List**: + +- **Tag their SPEECH ONLY** - add `data-speaker` attribute to the paragraph +- **DO NOT tag their mentions** - no `data-c` spans for unknown characters +- **Generate a descriptive slug** based on how the text refers to them or their observable traits + +### Slug Guidelines for Unknown Characters: + +- Keep descriptions concise but uniquely identifying (2-5 words) +- Use observable traits: role, appearance, location, action +- Be specific enough to differentiate similar characters (e.g., two soldiers → `tall-soldier-at-gate` vs `wounded-soldier`) + +**Good Examples:** `tall-soldier-at-gate`, `old-woman-selling-bread`, `gruff-innkeeper`, `the-nurse` + +**Bad Examples:** `person` (too generic), `speaker` (not descriptive), `character-1` (meaningless), `soldier` (too generic) + # Constraints (CRITICAL) 1. **Text Invariance:** The visible text inside the tags must remain **EXACTLY** the same as the input. Do not fix grammar, do not correct spelling, do not remove archaic words. @@ -63,8 +81,13 @@ Identify mentions of the characters within the text. **Output HTML:** ```html -

Książę spojrzał na Sarę, a jego wzrok złagodniał.

-

— Panie mój — wyszeptała Sara — twe słowa są jak światło.

+

+ Książę spojrzał na Sarę, a jego
+ wzrok złagodniał.
+

+

+ — Panie mój — wyszeptała Sara — twe słowa są jak światło.
+
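<!-- A minimal illustrative sketch of the tagging this example intends (not the original markup;
     the character ids "prince" and "sara" and the data-speaker value are assumptions): -->
<p>
  <span data-c="prince">Książę</span> spojrzał na <span data-c="sara">Sarę</span>, a jego
  wzrok złagodniał.
</p>
<p data-speaker="sara">
  — Panie mój — wyszeptała <span data-c="sara">Sara</span> — twe słowa są jak światło.
</p>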

``` ## Example 2: English (Quotes & Formatting) @@ -92,7 +115,9 @@ Identify mentions of the characters within the text. 'But they were in the well,' Alice said to the Dormouse, ignoring the remark.

-

'Of course they were', said the Dormouse; '—well in.'

+

+ 'Of course they were', said the Dormouse; '—well in.'
+
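<!-- A minimal illustrative sketch of the tagging this example intends (not the original markup;
     the character ids "alice" and "dormouse" and the data-speaker values are assumptions): -->
<p data-speaker="alice">
  'But they were in the well,' <span data-c="alice">Alice</span> said to the
  <span data-c="dormouse">Dormouse</span>, ignoring the remark.
</p>
<p data-speaker="dormouse">
  'Of course they were', said the <span data-c="dormouse">Dormouse</span>; '—well in.'
</p>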

``` ## Example 3: Multiple Speakers (Edge Case) @@ -121,6 +146,30 @@ Identify mentions of the characters within the text.

``` +## Example 4: Unknown Character Speaker + +**Characters (JSON):** + +```json +[{ "id": "winston", "name": "Winston", "desc": "Protagonist" }] +``` + +**Input HTML:** + +```html +

"Stand back!" shouted the tall soldier at the gate.

+

Winston obeyed silently.

+``` + +**Output HTML:** + +```html +

"Stand back!" shouted the tall soldier at the gate.

+

Winston obeyed silently.
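<!-- A minimal illustrative sketch of the output this example intends: the input above is the same
     two sentences as plain <p> elements, and the exact soldier slug below is only one reasonable
     choice under the guidelines; data-index attributes are omitted here. -->
<p data-speaker="tall-soldier-at-gate">"Stand back!" shouted the tall soldier at the gate.</p>
<p data-speaker="winston"><span data-c="winston">Winston</span> obeyed silently.</p>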

+``` + +Note: The soldier gets `data-speaker` with a descriptive slug, but is NOT wrapped in `data-c` because they're not in the Characters List. + --- ## Important reminder diff --git a/apps/pipeline/src/tools/NewRewriteParagraphsPromptBookChunked.md b/apps/pipeline/src/tools/NewRewriteParagraphsPromptBookChunked.md index bea6bbe8..d995eb3e 100644 --- a/apps/pipeline/src/tools/NewRewriteParagraphsPromptBookChunked.md +++ b/apps/pipeline/src/tools/NewRewriteParagraphsPromptBookChunked.md @@ -34,6 +34,24 @@ Identify mentions of the characters within the text. - **Flexibility:** Match names even if they appear in different grammatical cases (e.g., Polish declensions like "Winstona", "Winstonowi") or possessives (English "Winston's") or when referenced by title ("General") - but only if its a clear reference to the character. - **Structure:** `Mentioned Name` +## 3. Unknown Character Speakers + +When dialogue is spoken by a character **NOT in the Characters List**: + +- **Tag their SPEECH ONLY** - add `data-speaker` attribute to the paragraph +- **DO NOT tag their mentions** - no `data-c` spans for unknown characters +- **Generate a descriptive slug** based on how the text refers to them or their observable traits + +### Slug Guidelines for Unknown Characters: + +- Keep descriptions concise but uniquely identifying (2-5 words) +- Use observable traits: role, appearance, location, action +- Be specific enough to differentiate similar characters (e.g., two soldiers → `tall-soldier-at-gate` vs `wounded-soldier`) + +**Good Examples:** `tall-soldier-at-gate`, `old-woman-selling-bread`, `gruff-innkeeper`, `the-nurse` + +**Bad Examples:** `person` (too generic), `speaker` (not descriptive), `character-1` (meaningless), `soldier` (too generic) + # Constraints (CRITICAL) 1. **Text Invariance:** The visible text inside the tags must remain **EXACTLY** the same as the input. Do not fix grammar, do not correct spelling, do not remove archaic words. @@ -63,8 +81,13 @@ Identify mentions of the characters within the text. **Output HTML:** ```html -

Książę spojrzał na Sarę, a jego wzrok złagodniał.

-

— Panie mój — wyszeptała Sara — twe słowa są jak światło.

+

+ Książę spojrzał na Sarę, a jego
+ wzrok złagodniał.
+

+

+ — Panie mój — wyszeptała Sara — twe słowa są jak światło.
+
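<!-- A minimal illustrative sketch of the tagging this example intends (not the original markup;
     the character ids "prince" and "sara" and the data-speaker value are assumptions): -->
<p>
  <span data-c="prince">Książę</span> spojrzał na <span data-c="sara">Sarę</span>, a jego
  wzrok złagodniał.
</p>
<p data-speaker="sara">
  — Panie mój — wyszeptała <span data-c="sara">Sara</span> — twe słowa są jak światło.
</p>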

``` ## Example 2: English (Quotes & Formatting) @@ -92,7 +115,9 @@ Identify mentions of the characters within the text. 'But they were in the well,' Alice said to the Dormouse, ignoring the remark.

-

'Of course they were', said the Dormouse; '—well in.'

+

+ 'Of course they were', said the Dormouse; '—well in.'
+
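<!-- A minimal illustrative sketch of the tagging this example intends (not the original markup;
     the character ids "alice" and "dormouse" and the data-speaker values are assumptions): -->
<p data-speaker="alice">
  'But they were in the well,' <span data-c="alice">Alice</span> said to the
  <span data-c="dormouse">Dormouse</span>, ignoring the remark.
</p>
<p data-speaker="dormouse">
  'Of course they were', said the <span data-c="dormouse">Dormouse</span>; '—well in.'
</p>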

``` ## Example 3: Multiple Speakers (Edge Case) @@ -121,6 +146,30 @@ Identify mentions of the characters within the text.

``` +## Example 4: Unknown Character Speaker + +**Characters (JSON):** + +```json +[{ "id": "winston", "name": "Winston", "desc": "Protagonist" }] +``` + +**Input HTML:** + +```html +

"Stand back!" shouted the tall soldier at the gate.

+

Winston obeyed silently.

+``` + +**Output HTML:** + +```html +

"Stand back!" shouted the tall soldier at the gate.

+

Winston obeyed silently.
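<!-- A minimal illustrative sketch of the output this example intends: the input above is the same
     two sentences as plain <p> elements, and the exact soldier slug below is only one reasonable
     choice under the guidelines; data-index attributes are omitted here. -->
<p data-speaker="tall-soldier-at-gate">"Stand back!" shouted the tall soldier at the gate.</p>
<p data-speaker="winston"><span data-c="winston">Winston</span> obeyed silently.</p>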

+``` + +Note: The soldier gets `data-speaker` with a descriptive slug, but is NOT wrapped in `data-c` because they're not in the Characters List. + --- ## Important reminder diff --git a/apps/pipeline/src/tools/chapterChunker.spec.ts b/apps/pipeline/src/tools/chapterChunker.spec.ts new file mode 100644 index 00000000..3ccf6672 --- /dev/null +++ b/apps/pipeline/src/tools/chapterChunker.spec.ts @@ -0,0 +1,42 @@ +import { describe, it, expect } from "vitest"; +import { buildParagraphXml, type Paragraph } from "./chapterChunker"; + +describe("buildParagraphXml", () => { + it("renders attributes and escapes quotes inside attribute values", () => { + const paragraph: Paragraph = { + elementType: "p", + dataIndex: 1, + text: "Hello world", + attributes: { "data-title": 'A "quoted" title' }, + }; + + const output = buildParagraphXml(paragraph); + + expect(output).toBe('

Hello world

'); + }); + + it("preserves inner HTML tags in the paragraph text", () => { + const paragraph: Paragraph = { + elementType: "p", + dataIndex: 2, + text: 'Hello Mary.', + }; + + const output = buildParagraphXml(paragraph); + + expect(output).toContain("Mary"); + }); + + it("keeps double-quoted attributes inside embedded HTML", () => { + const paragraph: Paragraph = { + elementType: "figure", + dataIndex: 3, + text: 'Mrs. Inglethorp\'s bedroom', + }; + + const output = buildParagraphXml(paragraph); + + expect(output).toContain('alt="Mrs. Inglethorp\'s bedroom"'); + }); +}); diff --git a/apps/pipeline/src/tools/chapterChunker.ts b/apps/pipeline/src/tools/chapterChunker.ts index bd4fadd6..b6fda21b 100644 --- a/apps/pipeline/src/tools/chapterChunker.ts +++ b/apps/pipeline/src/tools/chapterChunker.ts @@ -111,7 +111,7 @@ function buildAttributeString(attributes?: Record): string { export function buildParagraphXml(p: Paragraph): string { const attrs = buildAttributeString(p.attributes); - return `<${p.elementType}${attrs}>${p.text.trim().replace(/"/g, "'")}`; + return `<${p.elementType}${attrs}>${p.text.trim()}`; } export function buildChunkXml(paragraphs: Paragraph[]): string { diff --git a/apps/pipeline/src/tools/fix-legacy-play-custom-tags.spec.ts b/apps/pipeline/src/tools/fix-legacy-play-custom-tags.spec.ts new file mode 100644 index 00000000..460ef7ea --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-custom-tags.spec.ts @@ -0,0 +1,70 @@ +import { describe, expect, it } from "vitest"; +import { JSDOM } from "jsdom"; +import { fixLegacyPlayCustomTags } from "./fix-legacy-play-custom-tags"; + +function parseSection(html: string): Element { + const dom = new JSDOM(html); + const doc = dom.window.document; + const section = doc.querySelector("section[data-chapter]"); + if (!section) { + throw new Error("Missing section[data-chapter] in test input"); + } + return section; +} + +describe("fixLegacyPlayCustomTags", () => { + it("converts non-HTML tags outside em to spans with data-c", () => { + const input = ` +
+

HAMLET.

+

O dear Ophelia, I am ill at these numbers.

+
+ `; + + const result = fixLegacyPlayCustomTags(input); + const section = parseSection(result); + + const hamlet = section.querySelector('span[data-c="hamlet"]'); + const ophelia = section.querySelector('span[data-c="ophelia"]'); + + expect(hamlet?.textContent).toBe("HAMLET"); + expect(ophelia?.textContent).toBe("Ophelia"); + + expect(section.querySelector("hamlet")).toBeNull(); + expect(section.querySelector("ophelia")).toBeNull(); + }); + + it("preserves custom attributes and maps enters/exits to data-* while dropping raw attrs", () => { + const input = ` +
+

House of Capulet.

+
+ `; + + const result = fixLegacyPlayCustomTags(input); + const section = parseSection(result); + + const capulet = section.querySelector( + 'span[data-c="capulet"][data-enters="true"][dynasty="true"]', + ); + + expect(capulet?.textContent).toBe("Capulet"); + expect(capulet?.hasAttribute("enters")).toBe(false); + expect(capulet?.hasAttribute("exits")).toBe(false); + }); + + it("does not convert valid HTML tags like cite", () => { + const input = ` +
+

Source: Some Book.

+
+ `; + + const result = fixLegacyPlayCustomTags(input); + const section = parseSection(result); + const cite = section.querySelector("cite"); + + expect(cite?.textContent).toBe("Some Book"); + expect(section.querySelector('span[data-c="cite"]')).toBeNull(); + }); +}); diff --git a/apps/pipeline/src/tools/fix-legacy-play-custom-tags.ts b/apps/pipeline/src/tools/fix-legacy-play-custom-tags.ts new file mode 100644 index 00000000..f1bdef9b --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-custom-tags.ts @@ -0,0 +1,168 @@ +import { JSDOM } from "jsdom"; + +const HTML_TAGS = new Set([ + "a", + "abbr", + "address", + "article", + "aside", + "audio", + "b", + "bdi", + "bdo", + "blockquote", + "body", + "br", + "button", + "canvas", + "caption", + "cite", + "code", + "col", + "colgroup", + "data", + "datalist", + "dd", + "del", + "details", + "dfn", + "dialog", + "div", + "dl", + "dt", + "em", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "header", + "hr", + "html", + "i", + "img", + "input", + "ins", + "kbd", + "label", + "legend", + "li", + "link", + "main", + "map", + "mark", + "meta", + "meter", + "nav", + "noscript", + "object", + "ol", + "optgroup", + "option", + "output", + "p", + "param", + "picture", + "pre", + "progress", + "q", + "rp", + "rt", + "ruby", + "s", + "samp", + "script", + "section", + "select", + "small", + "source", + "span", + "strong", + "style", + "sub", + "summary", + "sup", + "table", + "tbody", + "td", + "template", + "textarea", + "tfoot", + "th", + "thead", + "time", + "title", + "tr", + "track", + "u", + "ul", + "var", + "video", + "wbr", +]); + +function slugifyTag(tagName: string): string { + return tagName + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, ""); +} + +function copyAttributes(target: Element, source: Element, skip: Set): void { + for (const attr of Array.from(source.attributes)) { + const name = attr.name.toLowerCase(); + if (skip.has(name)) continue; + target.setAttribute(attr.name, attr.value); + } +} + +export function fixLegacyPlayCustomTags(html: string): string { + const dom = new JSDOM(html); + const doc = dom.window.document; + let didFix = false; + + const allElements = Array.from(doc.querySelectorAll("*")); + for (const element of allElements) { + const tagName = element.tagName.toLowerCase(); + if (HTML_TAGS.has(tagName)) continue; + + const slug = slugifyTag(tagName); + if (!slug) continue; + + const span = doc.createElement("span"); + span.setAttribute("data-c", slug); + + const entersValue = element.getAttribute("enters") ?? element.getAttribute("data-enters"); + const exitsValue = element.getAttribute("exits") ?? element.getAttribute("data-exits"); + + copyAttributes( + span, + element, + new Set(["enters", "exits", "data-enters", "data-exits", "talking"]), + ); + + if (entersValue !== null) { + span.setAttribute("data-enters", entersValue || "true"); + } + if (exitsValue !== null) { + span.setAttribute("data-exits", exitsValue || "true"); + } + + const text = (element.textContent || "").trim(); + if (text.length > 0) { + span.textContent = text; + } + + element.replaceWith(span); + didFix = true; + } + + return didFix ? 
doc.body.innerHTML : html; +} diff --git a/apps/pipeline/src/tools/fix-legacy-play-didaskalia.spec.ts b/apps/pipeline/src/tools/fix-legacy-play-didaskalia.spec.ts new file mode 100644 index 00000000..b3c3b24d --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-didaskalia.spec.ts @@ -0,0 +1,94 @@ +import { describe, expect, it } from "vitest"; +import { JSDOM } from "jsdom"; +import { fixLegacyPlayDidaskalia } from "./fix-legacy-play-didaskalia"; + +function parseSection(html: string): Element { + const dom = new JSDOM(html); + const doc = dom.window.document; + const section = doc.querySelector("section[data-chapter]"); + if (!section) { + throw new Error("Missing section[data-chapter] in test input"); + } + return section; +} + +describe("fixLegacyPlayDidaskalia", () => { + it("moves didaskalia and narration into current speaker until next speaker block", () => { + const input = ` +
+
+

So well thy words become thee as thy wounds;

+

They smack of honour both. Go get him surgeons.

+
+

Exit Sergeant, attended

+

Who comes here?

+

Enter ROSS

+
+

The worthy thane of Ross.

+
+
+ `; + + const result = fixLegacyPlayDidaskalia(input); + const section = parseSection(result); + const duncan = section.querySelector('div[data-speaker="duncan"]'); + expect(duncan).toBeTruthy(); + const duncanPs = duncan?.querySelectorAll("p") ?? []; + expect(duncanPs.length).toBe(4); + expect(duncanPs[2]?.textContent).toContain("Exit Sergeant"); + expect(duncanPs[3]?.textContent).toContain("Who comes here?"); + + const duncanNext = duncan?.nextElementSibling; + expect(duncanNext?.tagName.toLowerCase()).toBe("p"); + expect(duncanNext?.textContent).toContain("Enter ROSS"); + }); + + it("keeps didaskalia outside when it precedes the next speaker block", () => { + const input = ` +
+
+

So well thy words become thee as thy wounds;

+

They smack of honour both. Go get him surgeons.

+
+

Enter ROSS

+
+

The worthy thane of Ross.

+
+
+ `; + + const result = fixLegacyPlayDidaskalia(input); + const section = parseSection(result); + const duncan = section.querySelector('div[data-speaker="duncan"]'); + expect(duncan).toBeTruthy(); + const duncanPs = duncan?.querySelectorAll("p") ?? []; + expect(duncanPs.length).toBe(2); + + const duncanNext = duncan?.nextElementSibling; + expect(duncanNext?.tagName.toLowerCase()).toBe("p"); + expect(duncanNext?.textContent).toContain("Enter ROSS"); + }); + + it("keeps trailing didaskalia outside when it is the last element", () => { + const input = ` +
+
+

How goes the night, boy?

+
+

Exit

+
+ `; + + const result = fixLegacyPlayDidaskalia(input); + const section = parseSection(result); + const macbeth = section.querySelector('div[data-speaker="macbeth"]'); + expect(macbeth).toBeTruthy(); + const macbethPs = macbeth?.querySelectorAll("p") ?? []; + expect(macbethPs.length).toBe(1); + + const next = macbeth?.nextElementSibling; + expect(next?.tagName.toLowerCase()).toBe("p"); + expect(next?.getAttribute("data-is-didaskalia")).toBe("true"); + expect(next?.textContent).toContain("Exit"); + }); +}); diff --git a/apps/pipeline/src/tools/fix-legacy-play-didaskalia.ts b/apps/pipeline/src/tools/fix-legacy-play-didaskalia.ts new file mode 100644 index 00000000..f64c7a67 --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-didaskalia.ts @@ -0,0 +1,120 @@ +import { JSDOM } from "jsdom"; + +const BOUNDARY_TAGS = new Set(["h1", "h2", "h3", "h4", "h5", "h6"]); + +function isSpeakerBlock(element: Element): boolean { + if (element.tagName.toLowerCase() !== "div") return false; + if (!element.hasAttribute("data-speaker")) return false; + return ( + element.hasAttribute("data-label") || element.querySelector("[data-speaker-label]") !== null + ); +} + +function hasNonWhitespaceText(nodes: ChildNode[]): boolean { + return nodes.some((node) => node.nodeType === 3 && node.textContent?.trim()); +} + +function onlyElementChild(element: Element): Element | null { + const children = Array.from(element.children); + if (children.length !== 1) return null; + return children[0]; +} + +function isPureEmParagraph(p: Element): boolean { + if (hasNonWhitespaceText(Array.from(p.childNodes))) return false; + + const directChild = onlyElementChild(p); + if (!directChild) return false; + if (directChild.tagName.toLowerCase() === "em") return true; + + if (directChild.tagName.toLowerCase() !== "span") return false; + if (hasNonWhitespaceText(Array.from(directChild.childNodes))) return false; + + const spanChild = onlyElementChild(directChild); + return spanChild?.tagName.toLowerCase() === "em"; +} + +function isDidaskaliaParagraph(element: Element): boolean { + if (element.tagName.toLowerCase() !== "p") return false; + if (element.getAttribute("data-is-didaskalia") === "true") return true; + return isPureEmParagraph(element); +} + +function isBoundaryElement(element: Element): boolean { + const tagName = element.tagName.toLowerCase(); + return BOUNDARY_TAGS.has(tagName) || tagName === "section"; +} + +function findNextNonDidaskaliaSibling(start: Element): Element | null { + let next = start.nextElementSibling; + while (next && isDidaskaliaParagraph(next)) { + next = next.nextElementSibling; + } + return next; +} + +function isMovableParagraph(element: Element): boolean { + return element.tagName.toLowerCase() === "p" && !element.hasAttribute("data-speaker"); +} + +function fixSection(section: Element): boolean { + let didFix = false; + let node: Element | null = section.firstElementChild; + + while (node) { + if (isSpeakerBlock(node)) { + let cursor = node.nextElementSibling; + + while (cursor) { + if (isSpeakerBlock(cursor) || isBoundaryElement(cursor)) { + break; + } + + if (isDidaskaliaParagraph(cursor)) { + const nextNonDidaskalia = findNextNonDidaskaliaSibling(cursor); + if (!nextNonDidaskalia || isSpeakerBlock(nextNonDidaskalia)) { + break; + } + const toMove = cursor; + cursor = cursor.nextElementSibling; + node.appendChild(toMove); + didFix = true; + continue; + } + + if (isMovableParagraph(cursor)) { + const toMove = cursor; + cursor = cursor.nextElementSibling; + node.appendChild(toMove); + didFix = 
true; + continue; + } + + break; + } + } + + node = node.nextElementSibling; + } + + return didFix; +} + +export function fixLegacyPlayDidaskalia(html: string): string { + const dom = new JSDOM(html); + const doc = dom.window.document; + const sections = Array.from(doc.querySelectorAll("section[data-chapter]")); + const containers = sections.length ? sections : [doc.body]; + + let didFix = false; + for (const section of containers) { + if (!section.querySelector("div[data-speaker][data-label]")) { + continue; + } + if (fixSection(section)) { + didFix = true; + } + } + + return didFix ? doc.body.innerHTML : html; +} diff --git a/apps/pipeline/src/tools/fix-legacy-play-multi-speaker.spec.ts b/apps/pipeline/src/tools/fix-legacy-play-multi-speaker.spec.ts new file mode 100644 index 00000000..c519e4c6 --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-multi-speaker.spec.ts @@ -0,0 +1,39 @@ +import { describe, expect, it } from "vitest"; +import { + applyMultiSpeakerMapToHtml, + extractMultiSpeakerNextLineMapFromXml, +} from "./fix-legacy-play-multi-speaker"; + +describe("extractMultiSpeakerNextLineMapFromXml", () => { + it("maps the next line id to multiple speaker slugs", () => { + const xml = ` + +

ALL

+

Fair is foul, and foul is fair:

+
+ `; + + const map = extractMultiSpeakerNextLineMapFromXml(xml); + expect(map.get("ch1-p22-s1")).toEqual(["first-witch", "second-witch", "third-witch"]); + }); +}); + +describe("applyMultiSpeakerMapToHtml", () => { + it("updates data-speaker based on the next line id", () => { + const html = ` +
+
+

Fair is foul, and foul is fair:

+

Hover through the fog and filthy air.

+
+
+ `; + + const map = new Map([ + ["ch1-p22-s1", ["first-witch", "second-witch", "third-witch"]], + ]); + + const result = applyMultiSpeakerMapToHtml(html, map); + expect(result).toContain('data-speaker="first-witch second-witch third-witch"'); + }); +}); diff --git a/apps/pipeline/src/tools/fix-legacy-play-multi-speaker.ts b/apps/pipeline/src/tools/fix-legacy-play-multi-speaker.ts new file mode 100644 index 00000000..2284c853 --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-multi-speaker.ts @@ -0,0 +1,85 @@ +import { DOMParser, type Element as XMLElement } from "@xmldom/xmldom"; +import { JSDOM } from "jsdom"; + +function slugifyTagName(tagName: string): string { + return tagName + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, ""); +} + +function findFirstId(element: XMLElement): string | null { + const walker = element.getElementsByTagName("*"); + for (let i = 0; i < walker.length; i += 1) { + const node = walker.item(i) as XMLElement | null; + const id = node?.getAttribute("id"); + if (id) return id; + } + return null; +} + +function getTalkingTags(p: XMLElement): XMLElement[] { + const all = p.getElementsByTagName("*"); + const result: XMLElement[] = []; + for (let i = 0; i < all.length; i += 1) { + const node = all.item(i) as XMLElement | null; + if (node && node.getAttribute("talking") === "true") { + result.push(node); + } + } + return result; +} + +export function extractMultiSpeakerNextLineMapFromXml(xml: string): Map { + const parser = new DOMParser(); + const doc = parser.parseFromString(xml, "text/html"); + const paragraphs = Array.from(doc.getElementsByTagName("p")) as XMLElement[]; + const map = new Map(); + + for (let i = 0; i < paragraphs.length; i += 1) { + const p = paragraphs[i]; + const talking = getTalkingTags(p); + if (talking.length < 2) continue; + if (p.getElementsByTagName("strong").length === 0) continue; + + const speakers = Array.from( + new Set(talking.map((node) => slugifyTagName(node.tagName))), + ).filter(Boolean); + if (speakers.length < 2) continue; + + let nextId: string | null = null; + for (let j = i + 1; j < paragraphs.length; j += 1) { + nextId = findFirstId(paragraphs[j]); + if (nextId) break; + } + + if (nextId) { + map.set(nextId, speakers); + } + } + + return map; +} + +export function applyMultiSpeakerMapToHtml(html: string, map: Map): string { + if (map.size === 0) return html; + + const dom = new JSDOM(html); + const doc = dom.window.document; + const speakerBlocks = Array.from(doc.querySelectorAll("div[data-speaker][data-label]")); + let didFix = false; + + for (const block of speakerBlocks) { + const firstId = block.querySelector("[id]")?.getAttribute("id") ?? null; + if (!firstId) continue; + const speakers = map.get(firstId); + if (!speakers || speakers.length < 2) continue; + const joined = speakers.join(" "); + if (block.getAttribute("data-speaker") !== joined) { + block.setAttribute("data-speaker", joined); + didFix = true; + } + } + + return didFix ? 
doc.body.innerHTML : html; +} diff --git a/apps/pipeline/src/tools/fix-legacy-play-stage-directions.spec.ts b/apps/pipeline/src/tools/fix-legacy-play-stage-directions.spec.ts new file mode 100644 index 00000000..40cea2ac --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-stage-directions.spec.ts @@ -0,0 +1,77 @@ +import { describe, expect, it } from "vitest"; +import { JSDOM } from "jsdom"; +import { fixLegacyPlayStageDirections } from "./fix-legacy-play-stage-directions"; + +function parseSection(html: string): Element { + const dom = new JSDOM(html); + const doc = dom.window.document; + const section = doc.querySelector("section[data-chapter]"); + if (!section) { + throw new Error("Missing section[data-chapter] in test input"); + } + return section; +} + +describe("fixLegacyPlayStageDirections", () => { + it("converts legacy character tags in stage directions to spans with data-c and data-enters", () => { + const input = ` +
+

+ Enter BENVOLIO and MERCUTIO +

+
+ `; + + const result = fixLegacyPlayStageDirections(input); + const section = parseSection(result); + + const benvolio = section.querySelector('span[data-c="benvolio"][data-enters="true"]'); + const mercutio = section.querySelector('span[data-c="mercutio"][data-enters="true"]'); + + expect(benvolio?.textContent).toBe("BENVOLIO"); + expect(mercutio?.textContent).toBe("MERCUTIO"); + + expect(section.querySelector("benvolio")).toBeNull(); + expect(section.querySelector("mercutio")).toBeNull(); + }); + + it("converts legacy exit tags to spans with data-exits", () => { + const input = ` +
+

+ Exeunt all but BENVOLIOSAMPSON +

+
+ `; + + const result = fixLegacyPlayStageDirections(input); + const section = parseSection(result); + + const benvolio = section.querySelector('span[data-c="benvolio"]'); + const sampson = section.querySelector('span[data-c="sampson"][data-exits="true"]'); + + expect(benvolio?.textContent).toBe("BENVOLIO"); + expect(sampson?.textContent).toBe("SAMPSON"); + }); + + it("preserves custom attributes and drops raw enters/exits attributes", () => { + const input = ` +
+

+ Enter CAPULET +

+
+ `; + + const result = fixLegacyPlayStageDirections(input); + const section = parseSection(result); + + const capulet = section.querySelector( + 'span[data-c="capulet"][data-enters="true"][dynasty="true"]', + ); + + expect(capulet?.textContent).toBe("CAPULET"); + expect(capulet?.hasAttribute("enters")).toBe(false); + expect(capulet?.hasAttribute("exits")).toBe(false); + }); +}); diff --git a/apps/pipeline/src/tools/fix-legacy-play-stage-directions.ts b/apps/pipeline/src/tools/fix-legacy-play-stage-directions.ts new file mode 100644 index 00000000..e0477c3a --- /dev/null +++ b/apps/pipeline/src/tools/fix-legacy-play-stage-directions.ts @@ -0,0 +1,113 @@ +import { JSDOM } from "jsdom"; + +const KNOWN_HTML_TAGS = new Set([ + "a", + "article", + "aside", + "b", + "blockquote", + "br", + "button", + "caption", + "code", + "div", + "em", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "hr", + "i", + "img", + "li", + "main", + "ol", + "p", + "section", + "small", + "span", + "strong", + "sub", + "sup", + "table", + "tbody", + "td", + "th", + "thead", + "tr", + "u", + "ul", +]); + +function slugifyTag(tagName: string): string { + return tagName + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, ""); +} + +function isConvertibleStageDirectionNode(element: Element): boolean { + const tagName = element.tagName.toLowerCase(); + if (KNOWN_HTML_TAGS.has(tagName)) return false; + if (!element.closest("em")) return false; + return true; +} + +function getAttrValue(element: Element, name: string): string | null { + if (element.hasAttribute(name)) return element.getAttribute(name); + return null; +} + +function copyAttributes(target: Element, source: Element, skip: Set): void { + for (const attr of Array.from(source.attributes)) { + const name = attr.name.toLowerCase(); + if (skip.has(name)) continue; + target.setAttribute(attr.name, attr.value); + } +} + +export function fixLegacyPlayStageDirections(html: string): string { + const dom = new JSDOM(html); + const doc = dom.window.document; + let didFix = false; + + const allElements = Array.from(doc.querySelectorAll("*")); + for (const element of allElements) { + if (!isConvertibleStageDirectionNode(element)) continue; + + const tagName = element.tagName.toLowerCase(); + const slug = slugifyTag(tagName); + if (!slug) continue; + + const span = doc.createElement("span"); + span.setAttribute("data-c", slug); + + const entersValue = getAttrValue(element, "enters") ?? getAttrValue(element, "data-enters"); + const exitsValue = getAttrValue(element, "exits") ?? getAttrValue(element, "data-exits"); + + copyAttributes( + span, + element, + new Set(["enters", "exits", "data-enters", "data-exits", "talking"]), + ); + + if (entersValue !== null) { + span.setAttribute("data-enters", entersValue || "true"); + } + if (exitsValue !== null) { + span.setAttribute("data-exits", exitsValue || "true"); + } + + const text = (element.textContent || "").trim(); + if (text.length > 0) { + span.textContent = text; + } + + element.replaceWith(span); + didFix = true; + } + + return didFix ? 
doc.body.innerHTML : html; +} diff --git a/apps/pipeline/src/tools/fix-non-play-custom-tags.spec.ts b/apps/pipeline/src/tools/fix-non-play-custom-tags.spec.ts new file mode 100644 index 00000000..a4a947bd --- /dev/null +++ b/apps/pipeline/src/tools/fix-non-play-custom-tags.spec.ts @@ -0,0 +1,119 @@ +import { describe, expect, it } from "vitest"; +import { JSDOM } from "jsdom"; +import { fixNonPlayCustomTags } from "./fix-non-play-custom-tags"; + +function parseSection(html: string): Element { + const dom = new JSDOM(html); + const doc = dom.window.document; + const section = doc.querySelector("section[data-chapter]"); + if (!section) { + throw new Error("Missing section[data-chapter] in test input"); + } + return section; +} + +describe("fixNonPlayCustomTags", () => { + it("converts note tags to link-note anchors", () => { + const input = ` +
+

Text beforeafter.

+
+ `; + + const result = fixNonPlayCustomTags(input); + const section = parseSection(result); + + const note = section.querySelector('a.link-note[data-note="2"]'); + expect(note?.textContent).toBe("2"); + expect(section.querySelector("note")).toBeNull(); + }); + + it("converts self-closing note tags to link-note anchors", () => { + const input = ` +
+

See here.

+
+ `; + + const result = fixNonPlayCustomTags(input); + const section = parseSection(result); + + const note = section.querySelector('a.link-note[data-note="448"]'); + expect(note?.textContent).toBe("448"); + expect(section.querySelector("note")).toBeNull(); + }); + + it("converts inline custom tags to spans with data-c", () => { + const input = ` +
+

Hyades, Hastur, and Aldebaran.

+
+ `; + + const result = fixNonPlayCustomTags(input); + const section = parseSection(result); + + const hastur = section.querySelector('span[data-c="hastur"]'); + expect(hastur?.textContent).toBe("Hastur"); + expect(section.querySelector("hastur")).toBeNull(); + }); + + it("promotes empty custom tags at start of a paragraph to data-speaker", () => { + const input = ` +
+

— Stój! Pal!

+
+ `; + + const result = fixNonPlayCustomTags(input); + const section = parseSection(result); + const paragraph = section.querySelector('p[data-index="123"]'); + + expect(paragraph?.getAttribute("data-speaker")).toBe("stoj-pal"); + expect(section.querySelector("stoj-pal")).toBeNull(); + expect(paragraph?.textContent?.trim().startsWith("— Stój! Pal!")).toBe(true); + }); + + it("treats self-closing talking tags at start as speakers", () => { + const input = ` +
+

'How doth the little crocodile

+
+ `; + + const result = fixNonPlayCustomTags(input); + const section = parseSection(result); + const paragraph = section.querySelector("p.verse"); + + expect(paragraph?.getAttribute("data-speaker")).toBe("alice"); + expect(section.querySelector("alice")).toBeNull(); + }); + + it("leaves hgroup tags intact", () => { + const input = ` +
+

Title

+
+ `; + + const result = fixNonPlayCustomTags(input); + const section = parseSection(result); + expect(section.querySelector("hgroup")).not.toBeNull(); + }); + + it("handles invalid see tags by preserving the reference", () => { + const input = ` +
+

+
+ `; + + const result = fixNonPlayCustomTags(input); + const section = parseSection(result); + + const see = section.querySelector('span[data-see="05.05.Sketch.gif"]'); + expect(see).not.toBeNull(); + expect(section.querySelector('span[data-c="see"]')).toBeNull(); + expect(section.querySelector("see")).toBeNull(); + }); +}); diff --git a/apps/pipeline/src/tools/fix-non-play-custom-tags.ts b/apps/pipeline/src/tools/fix-non-play-custom-tags.ts new file mode 100644 index 00000000..324be81e --- /dev/null +++ b/apps/pipeline/src/tools/fix-non-play-custom-tags.ts @@ -0,0 +1,268 @@ +import { JSDOM } from "jsdom"; + +const HTML_TAGS = new Set([ + "a", + "abbr", + "address", + "article", + "aside", + "audio", + "b", + "bdi", + "bdo", + "blockquote", + "body", + "br", + "button", + "canvas", + "caption", + "cite", + "code", + "col", + "colgroup", + "data", + "datalist", + "dd", + "del", + "details", + "dfn", + "dialog", + "div", + "dl", + "dt", + "em", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "header", + "hr", + "html", + "i", + "img", + "input", + "ins", + "kbd", + "label", + "legend", + "li", + "link", + "main", + "map", + "mark", + "meta", + "meter", + "nav", + "noscript", + "object", + "ol", + "optgroup", + "option", + "output", + "p", + "param", + "picture", + "pre", + "progress", + "q", + "rp", + "rt", + "ruby", + "s", + "samp", + "script", + "section", + "select", + "small", + "source", + "span", + "strong", + "style", + "sub", + "summary", + "sup", + "table", + "tbody", + "td", + "template", + "textarea", + "tfoot", + "th", + "thead", + "time", + "title", + "tr", + "track", + "u", + "ul", + "var", + "video", + "wbr", +]); + +const IGNORED_CUSTOM_TAGS = new Set(["hgroup"]); +const NON_SPEAKER_TAGS = new Set(["note", "see"]); + +function slugifyTag(tagName: string): string { + return tagName + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, ""); +} + +function copyAttributes(target: Element, source: Element, skip: Set): void { + for (const attr of Array.from(source.attributes)) { + const name = attr.name.toLowerCase(); + if (skip.has(name)) continue; + target.setAttribute(attr.name, attr.value); + } +} + +function normalizeInvalidSeeTags(html: string): string { + return html.replace(/]+)>/gi, (match, rawValue) => { + const value = String(rawValue).trim(); + if (!value || value.includes("=")) { + return match; + } + const escaped = value.replace(/"/g, """); + return ``; + }); +} + +function isCustomTag(tagName: string): boolean { + const lower = tagName.toLowerCase(); + if (HTML_TAGS.has(lower)) return false; + if (IGNORED_CUSTOM_TAGS.has(lower)) return false; + return true; +} + +function findFirstSignificantElement(p: Element): Element | null { + for (const node of Array.from(p.childNodes)) { + if (node.nodeType === 3) { + if (node.textContent?.trim()) return null; + continue; + } + if (node.nodeType === 1) return node as Element; + } + return null; +} + +function isEmptyElement(element: Element): boolean { + if (element.children.length > 0) return false; + return !(element.textContent ?? 
"").trim(); +} + +function unwrapElement(element: Element): void { + const parent = element.parentNode; + if (!parent) return; + while (element.firstChild) { + parent.insertBefore(element.firstChild, element); + } + parent.removeChild(element); +} + +function convertNotes(doc: Document): boolean { + let didFix = false; + const notes = Array.from(doc.querySelectorAll("note")); + for (const note of notes) { + const id = note.getAttribute("id") ?? ""; + const anchor = doc.createElement("a"); + anchor.className = "link-note"; + if (id) { + anchor.setAttribute("data-note", id); + anchor.textContent = id; + } + note.replaceWith(anchor); + didFix = true; + } + return didFix; +} + +function promoteSpeakerTags(doc: Document): boolean { + let didFix = false; + const paragraphs = Array.from(doc.querySelectorAll("p")); + for (const p of paragraphs) { + const firstElement = findFirstSignificantElement(p); + if (!firstElement) continue; + const tagName = firstElement.tagName.toLowerCase(); + if (!isCustomTag(tagName)) continue; + if (NON_SPEAKER_TAGS.has(tagName)) continue; + const hasTalking = firstElement.getAttribute("talking") === "true"; + if (!hasTalking && !isEmptyElement(firstElement)) continue; + + const slug = slugifyTag(tagName); + if (!slug) continue; + if (!p.hasAttribute("data-speaker")) { + p.setAttribute("data-speaker", slug); + } + if (firstElement.childNodes.length > 0) { + unwrapElement(firstElement); + } else { + firstElement.remove(); + } + didFix = true; + } + return didFix; +} + +function convertInlineCustomTags(doc: Document): boolean { + let didFix = false; + const elements = Array.from(doc.querySelectorAll("*")); + for (const element of elements) { + const tagName = element.tagName.toLowerCase(); + if (!isCustomTag(tagName)) continue; + if (tagName === "note") continue; + if (IGNORED_CUSTOM_TAGS.has(tagName)) continue; + + if (tagName === "see") { + const span = doc.createElement("span"); + copyAttributes(span, element, new Set(["talking"])); + const text = (element.textContent || "").trim(); + if (text.length > 0) { + span.textContent = text; + } + element.replaceWith(span); + didFix = true; + continue; + } + + const slug = slugifyTag(tagName); + if (!slug) continue; + + const span = doc.createElement("span"); + span.setAttribute("data-c", slug); + copyAttributes(span, element, new Set(["talking"])); + + const text = (element.textContent || "").trim(); + if (text.length > 0) { + span.textContent = text; + } + + element.replaceWith(span); + didFix = true; + } + return didFix; +} + +export function fixNonPlayCustomTags(html: string): string { + const normalized = normalizeInvalidSeeTags(html); + const dom = new JSDOM(normalized); + const doc = dom.window.document; + + const didFixNotes = convertNotes(doc); + const didFixSpeakers = promoteSpeakerTags(doc); + const didFixInline = convertInlineCustomTags(doc); + + if (didFixNotes || didFixSpeakers || didFixInline || normalized !== html) { + return doc.body.innerHTML; + } + return html; +} diff --git a/apps/pipeline/src/tools/fixLongXml.ts b/apps/pipeline/src/tools/fixLongXml.ts index 2b4f94d2..9fa1de99 100644 --- a/apps/pipeline/src/tools/fixLongXml.ts +++ b/apps/pipeline/src/tools/fixLongXml.ts @@ -60,7 +60,7 @@ function formatChapterElement(chapter: Element, serializer: XMLSerializer): stri function formatSource(xml: string): string { const parser = new DOMParser({ onError: () => {} }); - const doc = parser.parseFromString(xml, "text/xml"); + const doc = parser.parseFromString(xml, "text/html"); const serializer = new 
XMLSerializer(); // If the file is just one , format that; otherwise format all Chapters found. diff --git a/apps/pipeline/src/tools/generate-book-cli/wrapChaptersWithSections.ts b/apps/pipeline/src/tools/generate-book-cli/wrapChaptersWithSections.ts index a54cc7de..3a8df7c2 100644 --- a/apps/pipeline/src/tools/generate-book-cli/wrapChaptersWithSections.ts +++ b/apps/pipeline/src/tools/generate-book-cli/wrapChaptersWithSections.ts @@ -17,7 +17,7 @@ import { JSDOM } from "jsdom"; * whitespace‑agnostic. */ export function wrapChaptersWithSections(xml: string): string { - const dom = new JSDOM(xml, { contentType: "text/xml" }); + const dom = new JSDOM(xml, { contentType: "text/html" }); const doc = dom.window.document; const NS = doc.documentElement.namespaceURI || null; diff --git a/apps/pipeline/src/tools/getParagraphsFromChapterWithText.spec.ts b/apps/pipeline/src/tools/getParagraphsFromChapterWithText.spec.ts new file mode 100644 index 00000000..21581639 --- /dev/null +++ b/apps/pipeline/src/tools/getParagraphsFromChapterWithText.spec.ts @@ -0,0 +1,34 @@ +import { describe, it, expect } from "vitest"; +import { getParagraphsFromChapterWithText } from "./getParagraphsFromChapterWithText"; + +describe("getParagraphsFromChapterWithText", () => { + it("preserves curly double quotes inside attribute values", () => { + const bookText = ` +
+
+ A note with “ll” and “and” visible. +
+
+ `; + + const paragraphs = getParagraphsFromChapterWithText(1, bookText); + const html = paragraphs[0]?.text ?? ""; + + expect(html).toContain("“ll”"); + expect(html).toContain("“and”"); + expect(html).not.toContain('"ll"'); + }); + + it("preserves curly double quotes in text content", () => { + const bookText = ` +
+

He said “hello” and left.

+
+ `; + + const paragraphs = getParagraphsFromChapterWithText(1, bookText); + const html = paragraphs[0]?.text ?? ""; + + expect(html).toContain("“hello”"); + }); +}); diff --git a/apps/pipeline/src/tools/getParagraphsFromChapterWithText.ts b/apps/pipeline/src/tools/getParagraphsFromChapterWithText.ts index f475ba3b..5ed2a66b 100644 --- a/apps/pipeline/src/tools/getParagraphsFromChapterWithText.ts +++ b/apps/pipeline/src/tools/getParagraphsFromChapterWithText.ts @@ -46,8 +46,6 @@ export const getParagraphsFromChapterWithText = ( .filter((element) => element?.text.length > 0) .map((pageText, index) => { const text = pageText.text - .replace(/\u201c/g, '"') - .replace(/\u201d/g, '"') .replace(/\u2019/g, "'") .replace(/\u2018/g, "'") .replace(/\u2013/g, "-") diff --git a/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts b/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts index c5afb17f..afac8e2f 100644 --- a/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts +++ b/apps/pipeline/src/tools/identifyEntityAndRewriteParagraphs.ts @@ -1,4 +1,4 @@ -import { callClaude, callGeminiWrapper } from "../callClaude"; +import { callGeminiWrapper } from "../callClaude"; import { getParagraphsFromChapter, getSectionAttributesFromChapter, @@ -7,6 +7,8 @@ import { logger } from "../logger"; import fs from "fs"; import { compareXmlTextContent } from "./new-tooling/compare-chapters-xml"; import { restoreOriginalTextInHtml } from "./new-tooling/restore-text-in-html"; +import { restoreUnwrappedLines } from "./new-tooling/restore-unwrapped-lines"; +import { sanitizeNestedParagraphs } from "./new-tooling/sanitize-nested-paragraphs"; import path from "path"; import { type NewReferenceCardsResponse } from "../types"; import { writeBookFile } from "../helpers/writeBookFile"; @@ -122,7 +124,13 @@ async function processChunk( ); writeBookFile(`compiled-prompt-for-chapter-${chapter}-chunk-${chunkIndex}.md`, compiledPrompt); - const llmProviders = [callGeminiWrapper, callGrok, callClaude, callGpt5]; + const llmProviders = [ + callGeminiWrapper, + callGeminiWrapper, + callGeminiWrapper, + callGrok, + callGpt5, + ]; try { const selectedProvider = llmProviders[attempt % llmProviders.length]; @@ -140,6 +148,10 @@ async function processChunk( logger.info(`Response for chapter ${chapter} chunk ${chunkIndex}:`, response.slice(0, 50)); const clearedResponse = response.replace(/```xml\n/, "").replace(/\n```$/, ""); + writeBookFile( + `rewritten-paragraphs-for-chapter-${chapter}-chunk-${chunkIndex}-${selectedProvider.name}.raw.xml`, + clearedResponse, + ); let restored = clearedResponse; try { @@ -148,6 +160,21 @@ async function processChunk( logger.error(`Error restoring original text for chapter ${chapter} chunk ${chunkIndex}`, e); } + try { + restored = restoreUnwrappedLines(originalChunkXml, restored); + } catch (e) { + logger.error(`Error restoring unwrapped lines for chapter ${chapter} chunk ${chunkIndex}`, e); + } + + try { + restored = sanitizeNestedParagraphs(restored); + } catch (e) { + logger.error( + `Error sanitizing nested paragraphs for chapter ${chapter} chunk ${chunkIndex}`, + e, + ); + } + if (restored && compareXmlTextContent(originalChunkXml, restored)) { logger.info(`✅ Chunk ${chunkIndex} validated for chapter ${chapter}`); writeBookFile(`${chunkFileName.replace(".xml", "")}-${selectedProvider.name}.xml`, restored); @@ -271,10 +298,8 @@ export const identifyAndRewriteParagraphs = async ( writeBookFile(`compiled-prompt-for-chapter-${chapter}-gemini2.md`, compiledPrompt); - // 
const llmProviders = [callGeminiWrapper, callClaude]; + const llmProviders = [callGeminiWrapper, callGeminiWrapper, callGrok, callGpt5]; - const llmProviders = [callGeminiWrapper, callGrok, callClaude, callGpt5]; - // const llmProviders = [callGeminiWrapper]; try { const selectedProvider = llmProviders[attempt % llmProviders.length]; logger.info("Using provider: " + selectedProvider.name); @@ -284,6 +309,10 @@ export const identifyAndRewriteParagraphs = async ( "identify entities for paragraph response for chapter " + chapter, response.slice(0, 50), ); + writeBookFile( + `identify-entities-for-paragraph-response-for-chapter-${chapter}-${selectedProvider.name}.raw.txt`, + response, + ); const clearedResponse = response.replace(/```xml\n/, "").replace(/\n```$/, ""); let restored = clearedResponse; @@ -293,6 +322,18 @@ export const identifyAndRewriteParagraphs = async ( logger.error("Error restoring original text for chapter " + chapter, e); } + try { + restored = restoreUnwrappedLines(paragraphsForPage, restored); + } catch (e) { + logger.error("Error restoring unwrapped lines for chapter " + chapter, e); + } + + try { + restored = sanitizeNestedParagraphs(restored); + } catch (e) { + logger.error("Error sanitizing nested paragraphs for chapter " + chapter, e); + } + if (restored && compareXmlTextContent(paragraphsForPage, restored)) { // Build section attributes string, including format and any preserved epub-type const formatAttr = chapterFormat !== "prose" ? ` data-chapter-format="${chapterFormat}"` : ""; @@ -337,10 +378,9 @@ export const identifyCharactersAndRewriteParagraphs = async ( ) => { const bookSettings = getBookSettings(); - const charactersForChapter = referenceCards.characters.map((c) => ({ - name: c.name, - summary: c.referenceCard, - })); + const charactersForChapter = referenceCards.characters + .filter((c) => c.name !== "generic-avatar") // Exclude synthetic generic-avatar from LLM prompts + .map((c) => ({ name: c.name, summary: c.referenceCard })); const jsonCharacters = buildJsonCharacters(charactersForChapter); // Prepare all chapter data diff --git a/apps/pipeline/src/tools/importScannedBook.ts b/apps/pipeline/src/tools/importScannedBook.ts index 8b0cec71..b47969aa 100644 --- a/apps/pipeline/src/tools/importScannedBook.ts +++ b/apps/pipeline/src/tools/importScannedBook.ts @@ -27,6 +27,7 @@ import type { ChapterDetectionResult } from "../scan-server/chapterDetector"; import type { BookAnalysis, BookCharacter } from "../scan-server/ocrSchema"; import { generateBookHtml, type GeneratedChapterHtml } from "../scan-server/htmlGenerator"; import { generateCharacterImageWithOpenAI } from "./new-tooling/generate-pictures-for-entities"; +import { computeParagraphCount } from "../lib/paragraphCount"; import "dotenv/config"; const SCANNED_BOOKS_DIR = path.resolve(__dirname, "../../scanned-books"); @@ -386,8 +387,9 @@ async function step4_ImportChapters( console.log(` Importing ${chapters.length} chapters`); for (const chapter of chapters) { + const paragraphCount = computeParagraphCount(chapter.html); console.log( - ` Chapter ${chapter.chapterNumber}: ${chapter.title || "(no title)"} (${chapter.paragraphCount} paragraphs)`, + ` Chapter ${chapter.chapterNumber}: ${chapter.title || "(no title)"} (${paragraphCount} paragraphs)`, ); // Upload HTML content @@ -408,6 +410,7 @@ async function step4_ImportChapters( basename, chapterNumber: chapter.chapterNumber, title: chapter.title || undefined, + paragraphCount, sourceFormat: "HTML", }); } diff --git 
a/apps/pipeline/src/tools/importScannedBookIncremental.ts b/apps/pipeline/src/tools/importScannedBookIncremental.ts index 954375b9..6a3be17c 100644 --- a/apps/pipeline/src/tools/importScannedBookIncremental.ts +++ b/apps/pipeline/src/tools/importScannedBookIncremental.ts @@ -15,6 +15,7 @@ import type { DetectedChapter } from "../scan-server/chapterDetector"; import type { ChapterAnalysis, BookCharacter } from "../scan-server/ocrSchema"; import { generateChapterHtml } from "../scan-server/htmlGenerator"; import { generateCharacterImageWithOpenAI } from "./new-tooling/generate-pictures-for-entities"; +import { computeParagraphCount } from "../lib/paragraphCount"; const SCANNED_BOOKS_DIR = path.resolve(__dirname, "../../scanned-books"); @@ -330,6 +331,7 @@ async function importChapter( // Upload HTML const folderPath = `${bookPath}/chapters-source`; const basename = `chapter-${chapter.chapterNumber}.html`; + const paragraphCount = computeParagraphCount(generated.html); await convex.uploadFile({ folderPath, @@ -345,6 +347,7 @@ async function importChapter( basename, chapterNumber: chapter.chapterNumber, title: chapter.title || undefined, + paragraphCount, sourceFormat: "HTML", }); diff --git a/apps/pipeline/src/tools/new-tooling/ensure-section-wrapper.spec.ts b/apps/pipeline/src/tools/new-tooling/ensure-section-wrapper.spec.ts new file mode 100644 index 00000000..dfaf82fa --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/ensure-section-wrapper.spec.ts @@ -0,0 +1,19 @@ +import { describe, expect, it } from "vitest"; +import { ensureSectionWrapper } from "./ensure-section-wrapper"; + +describe("ensureSectionWrapper", () => { + it("passes through valid section wrappers", () => { + const input = '

<section data-chapter="1">Hi</section>';
+    expect(ensureSectionWrapper(input)).toBe(input);
+  });
+
+  it("accepts additional attributes", () => {
+    const input = '<section data-chapter="1" data-chapter-format="prose">Hi</section>';
+    expect(ensureSectionWrapper(input)).toBe(input);
+  });
+
+  it("throws when section wrapper is missing", () => {
+    const input = "<div>Hi</div>";
+    expect(() => ensureSectionWrapper(input)).toThrow("Missing <section> wrapper");
wrapper"); + }); +}); diff --git a/apps/pipeline/src/tools/new-tooling/ensure-section-wrapper.ts b/apps/pipeline/src/tools/new-tooling/ensure-section-wrapper.ts new file mode 100644 index 00000000..ae943345 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/ensure-section-wrapper.ts @@ -0,0 +1,9 @@ +export function ensureSectionWrapper(html: string): string { + const match = html.match( + /^\s*]*data-chapter\s*=\s*['"]?\d+['"]?[^>]*>[\s\S]*<\/section>\s*$/i, + ); + if (!match) { + throw new Error("Missing
wrapper"); + } + return html; +} diff --git a/apps/pipeline/src/tools/new-tooling/generate-flux-schnel-image.ts b/apps/pipeline/src/tools/new-tooling/generate-flux-schnel-image.ts index 5a42c43f..5acd7c02 100644 --- a/apps/pipeline/src/tools/new-tooling/generate-flux-schnel-image.ts +++ b/apps/pipeline/src/tools/new-tooling/generate-flux-schnel-image.ts @@ -8,6 +8,7 @@ import { sanitizePromptForModeration, generateAbstractPortraitPrompt, } from "./generate-pictures-for-entities"; +import { logError } from "../../helpers/logError"; const replicate = new Replicate({ auth: process.env.REPLICATE_API_TOKEN }); @@ -138,22 +139,47 @@ export const generateFluxImage = async ( finalPrompt = `${generalPrompt} Only scene-setting environment. ${prompt}`; } + // this is input for flux-2-pro + // const input = { + // aspect_ratio: type === "avatar" ? "1:1" : "16:9", + // input_images: [], + // output_format: type === "background" ? "webp" : "png", + // output_quality: 80, + // prompt: finalPrompt, + // resolution: "1 MP", + // safety_tolerance: 5, + // seed: 43605, + // }; + const input = { + images: [], + prompt: finalPrompt, + go_fast: false, aspect_ratio: type === "avatar" ? "1:1" : "16:9", - input_images: [], output_format: type === "background" ? "webp" : "png", output_quality: 80, - prompt: finalPrompt, - resolution: "1 MP", - safety_tolerance: 5, - seed: 43605, + output_megapixels: "1", + disable_safety_checker: true, }; try { - const output = await replicate.run("black-forest-labs/flux-2-pro", { input }); + const output = await replicate.run("black-forest-labs/flux-2-klein-4b", { input }); - // @ts-expect-error wrong types of replicate - flux-2-pro returns object with .url() method - const url = output.url(); + let url: string; + try { + // @ts-expect-error wrong types of replicate - flux-2-pro returns object with .url() method + url = output.url(); + } catch { + try { + url = (output as unknown as { url: () => string }[])[0].url(); + } catch (e) { + logError("Failed to get URL from output", e); + } + } + + if (!url!) { + throw new Error("Failed to get URL from output"); + } logger.info(`Replicate returned URL: ${url}`); @@ -198,7 +224,7 @@ export const generateFluxImage = async ( if (require.main === module) { generateFluxImage( - "A beautiful woman with long brown hair and blue eyes", + "A beautiful woman with long brown hair and blue eyes, seductive pose, naked, red lips, sexy, elegant, beautiful, 18 years old, breasts exposed", "test", "SinCity style", "avatar", diff --git a/apps/pipeline/src/tools/new-tooling/generate-pictures-for-entities.ts b/apps/pipeline/src/tools/new-tooling/generate-pictures-for-entities.ts index ff32a7da..ad76f011 100644 --- a/apps/pipeline/src/tools/new-tooling/generate-pictures-for-entities.ts +++ b/apps/pipeline/src/tools/new-tooling/generate-pictures-for-entities.ts @@ -280,7 +280,10 @@ ${chapters .join("\n")} `; - const charactersXml = characterNames.map((name) => ``).join("\n"); + const charactersXml = characterNames + .filter((name) => name !== "generic-avatar") + .map((name) => ``) + .join("\n"); prompt = initialPrompt .replace("{{characters}}", `${charactersXml}`) .replace("{{bookText}}", bookText); @@ -288,6 +291,11 @@ ${chapters const response = await callGeminiWithThinkingAndSchemaAndParsed(prompt, CharactersSchema); logger.info(`Response: `, response); + response.characters.push({ + name: "generic-avatar", + visualGuide: + "A mysterious figure shown from behind or in silhouette. No distinct facial features visible. 
Anonymous, sexless, suitable for representing any unnamed character. Atmospheric lighting with the figure partially obscured by shadow or mist.", + }); return response; }; @@ -300,7 +308,15 @@ export const generatePicturesForEntities = async ( let generatedPrompts: CharactersType; if (bookFileExists("generated-prompts.json", FILE_TYPE.TEMPORARY)) { generatedPrompts = JSON.parse(readBookFile("generated-prompts.json", FILE_TYPE.TEMPORARY)); - console.log("inside generated prompts"); + console.log("[generatePicturesForEntities] Using cached generated-prompts.json"); + console.log( + "[generatePicturesForEntities] Cached characters:", + generatedPrompts.characters.map((c) => c.name), + ); + console.log( + "[generatePicturesForEntities] Reference cards characters:", + referenceCards.characters.map((c) => c.name), + ); } else { generatedPrompts = await generatePicturePrompts(referenceCards, { skipBookAnalysis }); writeBookFile( @@ -331,18 +347,6 @@ Propaganda posters for their graphic boldness and limited color palette. filteredPrompts.map(async (prompt) => { if (!knownCharactersArray.includes(prompt.name)) { console.log("Generating for ", prompt.name); - - // const translationPrompt = `Process the following draft of a visual prompt: "${prompt.visualGuide}". Remove relations (who is a cousin to who, etc), information about what happens to that person, etc. - // Remove any indication of episodic things, for example someone getting a bruise later. Leave this as a purely visual information, based on what we know. - // Remove any indication of nudity, sexual content, etc. Remove suggestions that someone is naked or descriptions of private body parts. - // If prompt in different language than English, translate it to English. - // Reply with prompt directly, without any other text, so this can be used directly as a prompt for image generation. 
Do not say: "Here is the prompt" or "understood", just reply with the prompt.`; - // const visulGuideTranslatedAndCleaned = await callClaude(translationPrompt, undefined, 10, 0); - - // console.log(visulGuideTranslatedAndCleaned); - // const image = await generateImage(visulGuideTranslatedAndCleaned, prompt.name); - // const image = await generateImage(visulGuideTranslatedAndCleaned, prompt.name); - await generateAndSaveCharacterImage(prompt.visualGuide, prompt.name, generalPrompt); } }), diff --git a/apps/pipeline/src/tools/new-tooling/generate-prompts-for-backgrounds.ts b/apps/pipeline/src/tools/new-tooling/generate-prompts-for-backgrounds.ts index c132a1ae..0b034306 100644 --- a/apps/pipeline/src/tools/new-tooling/generate-prompts-for-backgrounds.ts +++ b/apps/pipeline/src/tools/new-tooling/generate-prompts-for-backgrounds.ts @@ -12,7 +12,7 @@ import { generateImageWithFluxToFolder, } from "./generate-flux-schnel-image"; import { type GraphicalStyle } from "./create-graphical-style"; -import { callSlowGeminiWithThinkingAndSchemaAndParsed } from "../../callFastGemini"; +import { callGeminiWithThinkingAndSchemaAndParsed } from "../../callFastGemini"; import { generateCharacterImageWithOpenAI } from "./generate-pictures-for-entities"; import { bookFileExists } from "../../helpers/bookFileExists"; import type { NewReferenceCardsResponse } from "../../types"; @@ -104,6 +104,11 @@ export type GenerateBackgroundsOptions = { }; export const generateBackgrounds = async (options: GenerateBackgroundsOptions = {}) => { + if (FREE_RUN) { + logger.info("FREE_RUN enabled - skipping background generation."); + return; + } + const { customStyle, chapterNumbers, @@ -177,7 +182,11 @@ Chapter Text: ${chapter.content} }`; const schema = z.object({ sceneDescription: z.string() }); - const response = await callSlowGeminiWithThinkingAndSchemaAndParsed(prompt, schema); + const response = await callGeminiWithThinkingAndSchemaAndParsed( + prompt, + schema, + "gemini-3-flash-preview", + ); console.log(`${chapter.number} - ${JSON.stringify(response)}`); return { chapter: chapter.number, @@ -340,7 +349,11 @@ Chapter Text: ${chapter.content} }`; const schema = z.object({ sceneDescription: z.string() }); - const response = await callSlowGeminiWithThinkingAndSchemaAndParsed(prompt, schema); + const response = await callGeminiWithThinkingAndSchemaAndParsed( + prompt, + schema, + "gemini-3-flash-preview", + ); const outputFolder = "style-previews"; const generator = FREE_RUN ? 
generateImageWithFluxToFolder : generateImageWithOpenAIToFolder; diff --git a/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-summary.ts b/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-summary.ts index c0b4d857..814c9dd2 100644 --- a/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-summary.ts +++ b/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-summary.ts @@ -1,10 +1,11 @@ -import { callClaude, callGeminiWrapper } from "../../callClaude"; +import { callGeminiWrapper } from "../../callClaude"; import { getChaptersUpTo } from "../../helpers/getChaptersUpTo"; import { getBookSettings } from "../../helpers/getBookSettings"; import { writeBookFile } from "../../helpers/writeBookFile"; import { readBookFile } from "../../helpers/readBookFile"; import { FILE_TYPE } from "../../helpers/filesHelpers"; import { logger } from "../../logger"; +import { callGrokAzure } from "../../callGrokAzure"; export const makeRollingChapterSummaries = async () => { const bookSettings = getBookSettings(); @@ -78,7 +79,7 @@ Provide your summary clearly organized according to the structure above, explici `; // Use `prompt` with your LLM here and store the output as `summary` - const llmProviders = [callGeminiWrapper, callClaude]; + const llmProviders = [callGrokAzure, callGeminiWrapper]; const selectedProvider = llmProviders[attempt % llmProviders.length]; try { const summary = await selectedProvider( diff --git a/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-with-paragraphs-json-summary.ts b/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-with-paragraphs-json-summary.ts index 49c6e6f4..fc45f306 100644 --- a/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-with-paragraphs-json-summary.ts +++ b/apps/pipeline/src/tools/new-tooling/get-chapter-by-chapter-with-paragraphs-json-summary.ts @@ -6,7 +6,9 @@ import { getBookSettings } from "../../helpers/getBookSettings"; import { readBookFile } from "../../helpers/readBookFile"; import { FILE_TYPE } from "../../helpers/filesHelpers"; import { writeBookFile } from "../../helpers/writeBookFile"; -import { callSlowGeminiWithThinkingAndSchemaAndParsed } from "../../callFastGemini"; +import { callGeminiWithThinkingAndSchemaAndParsed } from "../../callFastGemini"; +import { callGrokAzureWithSchema } from "../../callGrokAzure"; +import { buildParagraphsForSummary } from "./summaryParagraphs"; // Define the schema for reference cards response const ScenesSummariesPerChapterSchema = z.object({ @@ -41,12 +43,7 @@ export async function generateSingleChapterSummary( ): Promise { const { chapterNum, paragraphs, rollingSummary, bookLanguage = "English" } = options; - const paragraphsForPage = paragraphs - .map( - (paragraph) => - `

${paragraph.text.trim().replace(/"/g, "'")}

`, - ) - .join("\n"); + const paragraphsForPage = buildParagraphsForSummary(paragraphs); const prompt = ` ## Fiction Book Chapter Summary @@ -107,11 +104,12 @@ Provide your summary clearly organized according to the structure above, explici let summary: ScenesSummariesPerChapter; try { - summary = (await callSlowGeminiWithThinkingAndSchemaAndParsed( + summary = (await callGeminiWithThinkingAndSchemaAndParsed( bookLanguage === "Polish" ? `${prompt}\n Książka jest po Polsku, więc napisz podsumowanie również po Polsku.` : prompt, ScenesSummariesPerChapterSchema, + "gemini-3-flash-preview", )) as ScenesSummariesPerChapter; } catch (e) { console.error(`Error for chapter ${chapterNum}`, e); @@ -169,12 +167,7 @@ export const turnChapterSummariesIntoBulletPointsMappedToParagraphs = async () = const paragraphsFromChapter = getParagraphsFromChapter(chapterNum, true, true); - const paragraphsForPage = paragraphsFromChapter - .map( - (paragraph) => - `

${paragraph.text.trim().replace(/"/g, "'")}

`, - ) - .join("\n"); + const paragraphsForPage = buildParagraphsForSummary(paragraphsFromChapter); const prompt = ` ## Fiction Book Chapter Summary @@ -235,17 +228,17 @@ Provide your summary clearly organized according to the structure above, explici let summary: ScenesSummariesPerChapter; try { - summary = (await callSlowGeminiWithThinkingAndSchemaAndParsed( + summary = (await callGrokAzureWithSchema( `${prompt}\n Reply in the language of the book. It's usually Polish or English. Your instructions are in English so you often reply in English, buts its VERY important to reply in Polish when the book is in Polish, and same goes for other languages..`, ScenesSummariesPerChapterSchema, )) as ScenesSummariesPerChapter; } catch (e) { console.error(`Error for chapter ${chapterNum}`, e); try { - summary = (await callClaude( + summary = (await callGeminiWithThinkingAndSchemaAndParsed( `${prompt}\n Reply in the language of the book. It's usually Polish or English. Your instructions are in English so you often reply in English, buts its VERY important to reply in Polish when the book is in Polish, and same goes for other languages.`, ScenesSummariesPerChapterSchema, - 2, + "gemini-3-flash-preview", )) as ScenesSummariesPerChapter; } catch (e) { console.error(`Error for chapter ${chapterNum}`, e); diff --git a/apps/pipeline/src/tools/new-tooling/get-chapter-title.spec.ts b/apps/pipeline/src/tools/new-tooling/get-chapter-title.spec.ts new file mode 100644 index 00000000..7abdf0bd --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/get-chapter-title.spec.ts @@ -0,0 +1,107 @@ +import { DOMParser, type Element as XMLElement } from "@xmldom/xmldom"; +import { describe, it, expect } from "vitest"; +import { getChapterTitle } from "./get-chapter-title"; + +describe("getChapterTitle", () => { + it("should return the chapter title", () => { + const chapter = `Chapter 1Content 1`; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("Chapter 1"); + }); + + it("should handle hgroup with label, ordinal, and title", () => { + const chapter = `
+
+

+ Book + II +

+

The Castle

+
+
`; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("Book II: The Castle"); + }); + + it("should handle hgroup with ordinal and title", () => { + const chapter = `
+
+

I

+

I Go to Styles

+
+
`; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("I: I Go to Styles"); + }); + + it("should use data-epub-type as title when no hgroup with title exists", () => { + const chapter = `
+

To my Mother

+
`; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("Dedication"); + }); + + it("should handle hgroup with title but no h2", () => { + const chapter = `
+
+

Prologue

+
+
`; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("Prologue"); + }); + + it("should handle hgroup with title and h2 but no ordinal", () => { + const chapter = `
+
+

Introduction

+

The Beginning

+
+
`; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("The Beginning"); + }); + + it("should handle legacy chapter with act (h3) element", () => { + const chapter = ` +

Act I

+ The Opening +
`; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("Act I, The Opening"); + }); + + it("should handle legacy chapter with title and subtitle", () => { + const chapter = ` + Chapter One. + In which our hero begins + `; + const parser = new DOMParser(); + const doc = parser.parseFromString(chapter, "text/xml"); + const root = doc.documentElement as XMLElement; + + expect(getChapterTitle(root)).toBe("Chapter One, In which our hero begins"); + }); +}); diff --git a/apps/pipeline/src/tools/new-tooling/get-chapter-title.ts b/apps/pipeline/src/tools/new-tooling/get-chapter-title.ts index 5ef509f2..fd9bc463 100644 --- a/apps/pipeline/src/tools/new-tooling/get-chapter-title.ts +++ b/apps/pipeline/src/tools/new-tooling/get-chapter-title.ts @@ -2,16 +2,75 @@ import { type Element as XMLElement } from "@xmldom/xmldom"; const getTitleText = (el?: XMLElement | null) => (el ? (el.textContent || "").trim() : ""); -export const getChapterTitle = (chapter: XMLElement): string => { - let currentAct = ""; +const getAttribute = (el: XMLElement, name: string): string | null => { + const attr = el.getAttribute(name); + return attr ? attr.trim() : null; +}; + +const hasEpubType = (el: XMLElement, type: string): boolean => { + const epubType = getAttribute(el, "data-epub-type"); + return epubType ? epubType.includes(type) : false; +}; + +const extractLabelAndOrdinalFromSpans = (h2: XMLElement): { label: string; ordinal: string } => { + let label = ""; + let ordinal = ""; + + const spans = h2.getElementsByTagName("span"); + for (let i = 0; i < spans.length; i++) { + const span = spans[i] as XMLElement; + const spanEpubType = getAttribute(span, "data-epub-type"); + if (spanEpubType === "label") { + label = getTitleText(span); + } else if (spanEpubType && spanEpubType.includes("ordinal")) { + ordinal = getTitleText(span); + } + } - if (chapter.getElementsByTagName("h2").length > 0) { - console.warn("h2 found in chapter, not supported yet"); + return { label, ordinal }; +}; + +const formatTitleWithOrdinal = (label: string, ordinal: string, title: string): string => { + if (label && ordinal) { + return `${label} ${ordinal}: ${title}`; } - if (chapter.getElementsByTagName("h1").length > 0) { - console.warn("h1 found in chapter, not supported yet"); + if (ordinal) { + return `${ordinal}: ${title}`; + } + return title; +}; + +const getTitleFromHgroup = (hgroup: XMLElement): string | null => { + const titleParagraphs = Array.from(hgroup.getElementsByTagName("p")).filter((p) => + hasEpubType(p as XMLElement, "title"), + ); + + if (titleParagraphs.length === 0) { + return null; } + const titleText = getTitleText(titleParagraphs[0] as XMLElement); + const h2Elements = hgroup.getElementsByTagName("h2"); + + if (h2Elements.length === 0) { + return titleText; + } + + const h2 = h2Elements[0] as XMLElement; + + // Check if h2 itself has ordinal attribute + if (hasEpubType(h2, "ordinal")) { + return formatTitleWithOrdinal("", getTitleText(h2), titleText); + } + + // Check for spans within h2 + const { label, ordinal } = extractLabelAndOrdinalFromSpans(h2); + return formatTitleWithOrdinal(label, ordinal, titleText); +}; + +const getLegacyChapterTitle = (chapter: XMLElement): string => { + let currentAct = ""; + const actElements = chapter.getElementsByTagName("h3").length > 0 ? 
chapter.getElementsByTagName("h3") @@ -31,16 +90,36 @@ export const getChapterTitle = (chapter: XMLElement): string => { currentAct = getTitleText(actElements[0]); } + console.log(`titleElements: ${titleElements.length}`); + const titleText = getTitleText(titleElements[0]); const subtitleText = getTitleText(subtitleElements[0]); - const chapterTitle = [ + return [ currentAct, titleText && subtitleText ? titleText.replace(/\.$/, "") : titleText, subtitleText, ] .filter(Boolean) .join(", "); +}; + +export const getChapterTitle = (chapter: XMLElement): string => { + // Check for hgroup structure first + const hgroups = chapter.getElementsByTagName("hgroup"); + if (hgroups.length > 0) { + const hgroupTitle = getTitleFromHgroup(hgroups[0] as XMLElement); + if (hgroupTitle) { + return hgroupTitle; + } + } + + // If no hgroup with title, check for data-epub-type on the root element + const epubType = getAttribute(chapter, "data-epub-type"); + if (epubType) { + return epubType.charAt(0).toUpperCase() + epubType.slice(1); + } - return chapterTitle; + // Fall back to existing logic for backward compatibility + return getLegacyChapterTitle(chapter); }; diff --git a/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book-prompt.md b/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book-prompt.md index 5ab75ae5..519b60bf 100644 --- a/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book-prompt.md +++ b/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book-prompt.md @@ -1,7 +1,7 @@ # Task Process a text chapter by chapter. For each story character who appears or is mentioned, create reference cards reflecting the knowledge about this person - how would a human introduce someone to that character without spoiling it. The information should be based mostly on when we first meet the character, but pointing towards the knowledge we know about him from the whole book - so avoid spoilers, but use the later revealed facts to determine whats important about the initial impression. -Maybe when we first meet the character he is working on his car in his garage. If in the rest of his book he does that from time to time, or we learn he is a mechanic, or a driver, or whatever like that, that's important detail. But if he is not mentioned in the context of cars again, that is irrelevant detail. No spoilers! Do not mention how things end or who they become. Only the most generic but relevant information. So skip anything that's surprising or important action that happened in the book, but build the background about the person. Who that person was at when the story starts. Do not mention any important life changes, like getting married, dying, getting a promotion, unless it happened at the very moment we learn about that person. +Maybe when we first meet the character he is working on his car in his garage. If in the rest of his book he does that from time to time, or we learn he is a mechanic, or a driver, or whatever like that, that's important detail. But if he is not mentioned in the context of cars again, that is irrelevant detail. No spoilers! Do not mention how things end or who they become. Only the most generic but relevant information. So skip anything that's surprising or important action that happened later in the book, but build the background about the person. Who that person was when the story starts. 
Do not mention any important life changes, like getting married, dying, getting a promotion, unless it happened at the very moment we learn about that person. ## **Output Goal: Character-Centric History** @@ -39,7 +39,6 @@ Return the _final, complete_ results after processing _all_ chapters in the foll - Focus on reminding the reader who the character _is_ based on past context (role, relationships, key history), not what they _do_ or _say_ in the current chapter (N). Avoid spoilers! - Do not write more than 1-2 short sentences about the person. This is not a summary of a book, this is a memory-jog to quickly get someone to connect character name with the actual character. - If the person is known by two names, add the second one in the parentheses. -- This is not a summary of a book, this is a memory-jog to quickly get someone to connect character name with the actual character. - Jeśli tekst jest po Polsku, odpowiedz po Polsku ## Book text diff --git a/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book.ts b/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book.ts index ccd9e92e..df43dc85 100644 --- a/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book.ts +++ b/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book.ts @@ -4,7 +4,7 @@ import { type NewReferenceCardsResponse } from "../../types"; import { NewReferenceCardsResponseSchema } from "../../schemes"; import { getChaptersUpTo } from "../../helpers/getChaptersUpTo"; import { getBookSettings } from "../../helpers/getBookSettings"; -import { callGrokWithSchema } from "../../callGrok"; +import { callGeminiWithThinkingAndSchemaAndParsed } from "src/callFastGemini"; export const getReferenceCardsForWholeBook = async (): Promise => { const booksSettings = getBookSettings(); @@ -52,7 +52,20 @@ ${knownCharactersMapped}\n\n` console.log("combinedPrompt length:", combinedPrompt.length); - return callGrokWithSchema(combinedPrompt, NewReferenceCardsResponseSchema); + const response = await callGeminiWithThinkingAndSchemaAndParsed( + combinedPrompt, + NewReferenceCardsResponseSchema, + ); + + // Add synthetic generic-avatar for unknown/minor speakers + // This will get an avatar generated but won't be passed to the rewrite prompts + response.characters.push({ + name: "generic-avatar", + referenceCard: + "A mysterious figure shown from behind or in silhouette. No distinct facial features visible. Enigmatic and anonymous, suitable for representing any unnamed character. Atmospheric lighting with the figure partially obscured by shadow or mist.", + }); + + return response; }; if (require.main === module) { diff --git a/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.spec.ts b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.spec.ts new file mode 100644 index 00000000..9fa1dabf --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.spec.ts @@ -0,0 +1,36 @@ +import { describe, expect, it } from "vitest"; +import { restoreUnwrappedBlocks } from "./restore-unwrapped-blocks"; + +describe("restoreUnwrappedBlocks", () => { + it("wraps dangling text and inline nodes using the original block element", () => { + const original = "

Alpha

Miss Howard nodded grimly.

Omega

"; + const model = "

Alpha

Miss Howard nodded grimly.

Omega

"; + + expect(restoreUnwrappedBlocks(original, model)).toBe( + "

Alpha

Miss Howard nodded grimly.

Omega

", + ); + }); + + it("preserves original attributes when rewrapping", () => { + const original = '

First

Second

'; + const model = '

First

Second'; + + expect(restoreUnwrappedBlocks(original, model)).toBe( + '

First

Second

', + ); + }); + + it("leaves unmatched dangling text untouched", () => { + const original = "

One

Two

"; + const model = "

One

Extra

Two

"; + + expect(restoreUnwrappedBlocks(original, model)).toBe("

One

Extra

Two

"); + }); + + it("returns the input when no dangling nodes exist", () => { + const original = "

One

Two

"; + const model = "

One

Two

"; + + expect(restoreUnwrappedBlocks(original, model)).toBe(model); + }); +}); diff --git a/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.ts b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.ts new file mode 100644 index 00000000..f3b03c2b --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.ts @@ -0,0 +1,166 @@ +import { DOMParser, XMLSerializer } from "@xmldom/xmldom"; +import type { Element as XMLElement, Node as XMLNode } from "@xmldom/xmldom"; + +type OriginalBlock = { element: XMLElement; tagName: string; normalizedText: string }; + +function normalizeText(text: string): string { + return text.replace(/\s+/g, " ").trim(); +} + +function getTextContent(node: XMLNode): string { + if (node.nodeType === node.TEXT_NODE) { + return node.nodeValue ?? ""; + } + if (node.nodeType === node.ELEMENT_NODE && node.childNodes) { + let text = ""; + for (let i = 0; i < node.childNodes.length; i++) { + text += getTextContent(node.childNodes[i]); + } + return text; + } + return ""; +} + +function getElementAttributes(element: XMLElement): Record { + const attrs: Record = {}; + if (!element.attributes) return attrs; + for (let i = 0; i < element.attributes.length; i++) { + const attr = element.attributes.item(i); + if (attr?.name) { + attrs[attr.name] = attr.value ?? ""; + } + } + return attrs; +} + +function buildAttributeString(attributes: Record): string { + const entries = Object.entries(attributes); + if (entries.length === 0) return ""; + return " " + entries.map(([key, value]) => `${key}="${value.replace(/"/g, """)}"`).join(" "); +} + +function stripXmlns(serialized: string): string { + return serialized.replace(/\s+xmlns="http:\/\/www\.w3\.org\/1999\/xhtml"/g, ""); +} + +function serializeNodes(nodes: XMLNode[], serializer: XMLSerializer): string { + return nodes.map((node) => stripXmlns(serializer.serializeToString(node))).join(""); +} + +function extractOriginalBlocks(section: XMLElement): OriginalBlock[] { + const blocks: OriginalBlock[] = []; + for (let i = 0; i < section.childNodes.length; i++) { + const node = section.childNodes[i]; + if (node.nodeType !== node.ELEMENT_NODE) continue; + const element = node as XMLElement; + const tagName = (element.tagName || "").toLowerCase(); + const normalizedText = normalizeText(getTextContent(element)); + blocks.push({ element, tagName, normalizedText }); + } + return blocks; +} + +function findMatchingOriginalIndex( + blocks: OriginalBlock[], + startIndex: number, + normalizedText: string, +): number { + if (!normalizedText) return -1; + for (let i = startIndex; i < blocks.length; i++) { + if (blocks[i].normalizedText === normalizedText) { + return i; + } + } + return -1; +} + +function wrapDanglingNodes( + originalElement: XMLElement, + danglingNodes: XMLNode[], + serializer: XMLSerializer, +): string { + const tagName = (originalElement.tagName || "").toLowerCase(); + const attrs = buildAttributeString(getElementAttributes(originalElement)); + const inner = serializeNodes(danglingNodes, serializer); + return `<${tagName}${attrs}>${inner}`; +} + +export function restoreUnwrappedBlocks(originalHtml: string, modelHtml: string): string { + if (originalHtml === modelHtml) return modelHtml; + + const parser = new DOMParser(); + const originalDoc = parser.parseFromString(`
<section>${originalHtml}</section>`, "text/html");
+  const modelDoc = parser.parseFromString(`<section>${modelHtml}</section>
`, "text/html"); + + const originalSection = originalDoc.getElementsByTagName("section")[0]; + const modelSection = modelDoc.getElementsByTagName("section")[0]; + if (!originalSection || !modelSection) return modelHtml; + + const originalBlocks = extractOriginalBlocks(originalSection); + if (originalBlocks.length === 0) return modelHtml; + + const allowedTags = new Set(originalBlocks.map((block) => block.tagName)); + const serializer = new XMLSerializer(); + + const output: string[] = []; + let danglingNodes: XMLNode[] = []; + let changed = false; + let originalIndex = 0; + + const flushDangling = () => { + if (danglingNodes.length === 0) return; + + const danglingText = normalizeText(danglingNodes.map((node) => getTextContent(node)).join("")); + const matchIndex = findMatchingOriginalIndex(originalBlocks, originalIndex, danglingText); + + if (matchIndex >= 0) { + output.push(wrapDanglingNodes(originalBlocks[matchIndex].element, danglingNodes, serializer)); + originalIndex = matchIndex + 1; + changed = true; + } else { + output.push(serializeNodes(danglingNodes, serializer)); + } + + danglingNodes = []; + }; + + for (let i = 0; i < modelSection.childNodes.length; i++) { + const node = modelSection.childNodes[i]; + + if (node.nodeType === node.TEXT_NODE) { + const text = node.nodeValue ?? ""; + if (normalizeText(text) === "") { + continue; + } + danglingNodes.push(node); + continue; + } + + if (node.nodeType === node.ELEMENT_NODE) { + const element = node as XMLElement; + const tagName = (element.tagName || "").toLowerCase(); + + if (!allowedTags.has(tagName)) { + danglingNodes.push(node); + continue; + } + + flushDangling(); + + // Keep valid block element as-is + output.push(stripXmlns(serializer.serializeToString(element))); + + const elementText = normalizeText(getTextContent(element)); + const matchIndex = findMatchingOriginalIndex(originalBlocks, originalIndex, elementText); + if (matchIndex >= 0) { + originalIndex = matchIndex + 1; + } + continue; + } + } + + flushDangling(); + + if (!changed) return modelHtml; + return output.join(""); +} diff --git a/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.repro.spec.ts b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.repro.spec.ts new file mode 100644 index 00000000..c9cf9079 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.repro.spec.ts @@ -0,0 +1,51 @@ +import { describe, expect, it } from "vitest"; +import { DOMParser } from "@xmldom/xmldom"; +import { existsSync, readFileSync } from "fs"; +import path from "path"; +import { restoreUnwrappedLines } from "./restore-unwrapped-lines"; + +const baseDir = path.resolve( + __dirname, + "../../../books-data/a-a-milne_the-red-house-mystery/temporary-output", +); + +const reproCases = [ + { + chapter: 11, + broken: "broken-rewritten-paragraphs-for-chapter-11-callGeminiWrapper.xml", + original: "original-paragraphs-for-chapter-11.xml", + }, + { + chapter: 18, + broken: "broken-rewritten-paragraphs-for-chapter-18-callGeminiWrapper.xml", + original: "original-paragraphs-for-chapter-18.xml", + }, +]; + +describe("restoreUnwrappedLines repro cases", () => { + if (!existsSync(baseDir)) { + // eslint-disable-next-line vitest/no-disabled-tests + it.skip("requires books-data for a-a-milne_the-red-house-mystery", () => {}); + return; + } + + for (const repro of reproCases) { + it(`repairs structural XML issues for chapter ${repro.chapter}`, () => { + const brokenPath = path.join(baseDir, repro.broken); + const originalPath = path.join(baseDir, 
repro.original); + + if (!existsSync(brokenPath) || !existsSync(originalPath)) { + return; + } + + const brokenXml = readFileSync(brokenPath, "utf-8"); + const originalXml = readFileSync(originalPath, "utf-8"); + const restored = restoreUnwrappedLines(originalXml, brokenXml); + + const parser = new DOMParser(); + const doc = parser.parseFromString(`${restored}`, "text/html"); + const parserErrors = doc.getElementsByTagName("parsererror"); + expect(parserErrors.length).toBe(0); + }); + } +}); diff --git a/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.spec.ts b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.spec.ts new file mode 100644 index 00000000..efd90720 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.spec.ts @@ -0,0 +1,242 @@ +import { describe, expect, it } from "vitest"; +import { restoreUnwrappedLines } from "./restore-unwrapped-lines"; + +describe("restoreUnwrappedLines", () => { + it("wraps a bare text line using the original

wrapper", () => { + const original = [ + "

One.

", + '

Two Evelyn.

', + "

Miss Howard nodded grimly.

", + ].join("\n"); + + const model = [ + "

One.

", + '

Two Evelyn.

', + "Miss Howard nodded grimly.", + ].join("\n"); + + const expected = [ + "

One.

", + '

Two Evelyn.

', + "

Miss Howard nodded grimly.

", + ].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); + + it("preserves indentation when wrapping", () => { + const original = ["

Intro.

", "

Indented line.

"].join("\n"); + + const model = ["

Intro.

", " Indented line."].join("\n"); + + const expected = ["

Intro.

", "

Indented line.

"].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); + + it("repairs a line that closes a

without opening it", () => { + const original = [ + "

'Then you must know where you found it?'

", + "

'Yes, it was on the prisoner's wardrobe.'

", + "

'That is better.'

", + ].join("\n"); + + const model = [ + "

'Then you must know where you found it?'

", + "'Yes, it was on the prisoner's wardrobe.'

", + "

'That is better.'

", + ].join("\n"); + + const expected = [ + "

'Then you must know where you found it?'

", + "

'Yes, it was on the prisoner's wardrobe.'

", + "

'That is better.'

", + ].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); + + it("repairs orphan

when model output is a single line", () => { + const original = [ + "

'Then you must know where you found it?'

", + "

'Yes, it was on the prisoner's wardrobe.'

", + "

'That is better.'

", + ].join("\n"); + + const model = + "

'Then you must know where you found it?'

" + + "'Yes, it was on the prisoner's wardrobe.'

" + + "

'That is better.'

"; + + const expected = [ + "

'Then you must know where you found it?'

", + "

'Yes, it was on the prisoner's wardrobe.'

", + "

'That is better.'

", + ].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); + + it("wraps multiple consecutive bare lines using original wrappers", () => { + const original = ["

One.

", "

Two.

", "

Three.

"].join("\n"); + + const model = ["

One.

", "Two.", "Three."].join("\n"); + + const expected = ["

One.

", "

Two.

", "

Three.

"].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); + + it("wraps a bare line with inline tags and trailing

", () => { + const original = ["

She saw the prisoner.

", "

It was unexpected.

"].join("\n"); + + const model = [ + "

She saw the prisoner.

", + ' It was unexpected.

', + ].join("\n"); + + const expected = [ + "

She saw the prisoner.

", + '

It was unexpected.

', + ].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); + + it("keeps concatenated

tags on a single line unchanged", () => { + const original = ["

One.

", "

Two.

"].join("\n"); + const model = "

One.

Two.

"; + + expect(restoreUnwrappedLines(original, model)).toBe(model); + }); + + it("closes an opening

when the next line starts a new

", () => { + const original = [ + "

'Did you not find it yourself?'

", + "

'Yes.'

", + "

'Then you must know where you found it?'

", + ].join("\n"); + + const model = [ + "

'Did you not find it yourself?'

", + "

'Yes.'", + "

'Then you must know where you found it?'

", + ].join("\n"); + + const expected = [ + "

'Did you not find it yourself?'

", + "

'Yes.'

", + "

'Then you must know where you found it?'

", + ].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); + + it("merges a multi-line paragraph when closing

is on the next line", () => { + const original = [ + '

He explained his reasons.

', + "

Next paragraph.

", + ].join("\n"); + + const model = [ + '

He explained his reasons and went on:', + "My theory is that it was obvious.

", + "

Next paragraph.

", + ].join("\n"); + + const expected = [ + '

He explained his reasons and went on: My theory is that it was obvious.

', + "

Next paragraph.

", + ].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); + + it("closes a paragraph at EOF when the closing tag is missing", () => { + const original = ["

First.

", "

Second.

"].join("\n"); + + const model = ["

First.

", "

Second."].join("\n"); + + const expected = ["

First.

", "

Second.

"].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); + + it("merges a multi-line paragraph without a closing tag before the next

", () => { + const original = ["

Alpha beta.

", "

Gamma.

"].join("\n"); + + const model = ["

Alpha", "beta.", "

Gamma.

"].join("\n"); + + const expected = ["

Alpha beta.

", "

Gamma.

"].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); + + it("moves stray punctuation before a

tag inside the paragraph", () => { + const original = [ + '

“Robert Ablett.”

', + "

Next line.

", + ].join("\n"); + + const model = [ + '“

Robert Ablett.”

', + "

Next line.

", + ].join("\n"); + + const expected = [ + '

Robert Ablett.”

', + "

Next line.

", + ].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); + + it("splits a line when a new

starts mid-line", () => { + const original = ["

Bill looked at him.

", "

Antony took his arm.

"].join("\n"); + + const model = "Bill looked at him.

Antony took his arm.

"; + + const expected = ["

Bill looked at him.

", "

Antony took his arm.

"].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); + + it("wraps restored bare text inserted between two paragraph tags", () => { + const original = [ + "

Once ... He quoted from the Luzumiyat to show that the poet-philosopher was a true Sufi.

", + '

"In his passionate hatred ..."

', + ].join("\n"); + + const model = [ + "

Once ...

", + "He quoted from the Luzumiyat to show that the poet-philosopher was a true Sufi.", + '

"In his passionate hatred ..."

', + ].join("\n"); + + const expected = [ + "

Once ...

", + "

He quoted from the Luzumiyat to show that the poet-philosopher was a true Sufi.

", + '

"In his passionate hatred ..."

', + ].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); + + it("wraps restored bare text when it is inline between

and the next

", () => { + const original = [ + "

Once ... He quoted from the Luzumiyat to show that the poet-philosopher was a true Sufi.

", + '

"In his passionate hatred ..."

', + ].join("\n"); + + const model = + '

Once ...

He quoted from the Luzumiyat to show that the poet-philosopher was a true Sufi.

"In his passionate hatred ..."

'; + + const expected = [ + "

Once ...

", + "

He quoted from the Luzumiyat to show that the poet-philosopher was a true Sufi.

", + '

"In his passionate hatred ..."

', + ].join("\n"); + + expect(restoreUnwrappedLines(original, model)).toBe(expected); + }); +}); diff --git a/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.ts b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.ts new file mode 100644 index 00000000..9f29ac83 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-lines.ts @@ -0,0 +1,255 @@ +import { DOMParser } from "@xmldom/xmldom"; + +type OriginalBlock = { openTag: string; closeTag: string; normalizedText: string }; + +function normalizeText(text: string): string { + return text.replace(/\s+/g, " ").trim(); +} + +function splitInlineParagraphTags(lines: string[]): { lines: string[]; changed: boolean } { + const result: string[] = []; + let changed = false; + + for (const line of lines) { + let remaining = line; + while (true) { + const index = remaining.search(/ 0 && /^[“”‘’"'.,;:!?…—–-]+$/.test(prefixTrimmed); + + if (prefixIsWhitespace || prefixIsPunctuation) { + const openEnd = rest.indexOf(">"); + if (openEnd !== -1) { + const prefixInside = prefixTrimmed; + const injected = `${rest.slice(0, openEnd + 1)}${prefixInside}${rest.slice(openEnd + 1)}`; + result.push(injected); + if (injected !== line) { + changed = true; + } + } else { + result.push(rest); + if (rest !== line) { + changed = true; + } + } + break; + } + + result.push(prefix.trimEnd()); + remaining = rest; + changed = true; + } + } + + return { lines: result, changed }; +} + +function splitInlineParagraphBoundaries(lines: string[]): { lines: string[]; changed: boolean } { + const result: string[] = []; + let changed = false; + + for (const line of lines) { + const rewritten = line.replace(/<\/p>\s+/gi, "

\n"); + + if (rewritten !== line) { + changed = true; + } + + result.push(...rewritten.split("\n")); + } + + return { lines: result, changed }; +} + +function getTextContent(html: string, parser: DOMParser): string { + const doc = parser.parseFromString(`

<p>${html}</p>
`, "text/html"); + const p = doc.getElementsByTagName("p")[0]; + return p?.textContent ?? ""; +} + +function extractOriginalBlocks(originalInner: string): OriginalBlock[] { + const parser = new DOMParser(); + const blocks: OriginalBlock[] = []; + const lines = originalInner.split(/\r?\n/); + + for (const line of lines) { + const trimmed = line.trim(); + if (!trimmed.startsWith(""); + const closeStart = trimmed.lastIndexOf("

"); + if (openEnd === -1 || closeStart === -1 || closeStart <= openEnd) continue; + + const openTag = trimmed.slice(0, openEnd + 1); + const innerHtml = trimmed.slice(openEnd + 1, closeStart); + const normalizedText = normalizeText(getTextContent(innerHtml, parser)); + + blocks.push({ openTag, closeTag: "

", normalizedText }); + } + + return blocks; +} + +function findMatchingIndex( + blocks: OriginalBlock[], + startIndex: number, + normalizedText: string, +): number { + if (!normalizedText) return -1; + for (let i = startIndex; i < blocks.length; i += 1) { + if (blocks[i].normalizedText === normalizedText) return i; + } + return -1; +} + +// eslint-disable-next-line complexity +export function restoreUnwrappedLines(originalInner: string, modelInner: string): string { + if (originalInner === modelInner) return modelInner; + + const blocks = extractOriginalBlocks(originalInner); + if (blocks.length === 0) return modelInner; + + const parser = new DOMParser(); + let normalizedModel = modelInner.includes("\n") + ? modelInner + : modelInner.replace(/<\/p>\s*/gi, "

\n").replace(/\s*= 0 && !lines[prevNonEmptyIndex].trim()) { + prevNonEmptyIndex -= 1; + } + const prevLine = lines[prevNonEmptyIndex] ?? ""; + + let nextNonEmptyIndex = i + 1; + while (nextNonEmptyIndex < lines.length && !lines[nextNonEmptyIndex].trim()) { + nextNonEmptyIndex += 1; + } + const nextLine = lines[nextNonEmptyIndex] ?? ""; + + if (!trimmed) { + output.push(line); + continue; + } + + if (trimmed.startsWith("")) { + let merged = line; + let consumedTo = i; + let foundClose = false; + let appended = false; + + for (let j = i + 1; j < lines.length; j += 1) { + const candidate = lines[j]; + const candidateTrimmed = candidate.trim(); + if (!candidateTrimmed) { + continue; + } + if (candidateTrimmed.startsWith("")) { + break; + } + + const isBlockBoundary = + /^(<\/?(section|h\d|hgroup|blockquote|div|ul|ol|li|table|figure)\b)/i.test( + candidateTrimmed, + ); + if (isBlockBoundary) { + break; + } + + merged = `${merged} ${candidateTrimmed}`; + appended = true; + consumedTo = j; + if (candidateTrimmed.includes("
</p>")) {
+          foundClose = true;
+          break;
+        }
+      }
+
+      if (appended) {
+        if (!foundClose) {
+          merged = `${merged}</p>
`;
+        }
+        output.push(merged);
+        changed = true;
+        i = consumedTo;
+        continue;
+      }
+
+      const nextTrimmed = nextLine.trim();
+      if (
+        nextNonEmptyIndex >= lines.length ||
+        nextTrimmed.startsWith("<p")
+      ) {
+        line = `${line}</p>
`; + changed = true; + } + } + + if (trimmed.startsWith("<")) { + output.push(line); + continue; + } + + let lineContent = trimmed; + const closingPMatch = lineContent.match(/<\/p>\s*$/i); + if (closingPMatch) { + lineContent = lineContent.slice(0, closingPMatch.index).trimEnd(); + } + + const normalizedText = normalizeText(getTextContent(lineContent, parser)); + const matchIndex = findMatchingIndex(blocks, originalIndex, normalizedText); + + if (matchIndex >= 0) { + const indentMatch = line.match(/^\s*/); + const indent = indentMatch ? indentMatch[0] : ""; + const block = blocks[matchIndex]; + output.push(`${indent}${block.openTag}${lineContent}${block.closeTag}`); + originalIndex = matchIndex + 1; + changed = true; + continue; + } + + const prevTrimmed = prevLine.trim(); + const nextTrimmed = nextLine.trim(); + const isIsolatedBetweenParagraphs = + prevNonEmptyIndex >= 0 && + nextNonEmptyIndex < lines.length && + prevTrimmed.endsWith("
</p>") &&
+      nextTrimmed.startsWith("<p");
+
+    if (isIsolatedBetweenParagraphs) {
+      output.push(`<p>${lineContent}</p>
`); + changed = true; + continue; + } + + output.push(line); + } + + const joined = output.join("\n"); + if (!changed) return modelInner; + return modelInner.endsWith("\n") ? `${joined}\n` : joined; +} diff --git a/apps/pipeline/src/tools/new-tooling/sanitize-nested-paragraphs.spec.ts b/apps/pipeline/src/tools/new-tooling/sanitize-nested-paragraphs.spec.ts new file mode 100644 index 00000000..69774937 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/sanitize-nested-paragraphs.spec.ts @@ -0,0 +1,30 @@ +import { describe, expect, it } from "vitest"; +import { sanitizeNestedParagraphs } from "./sanitize-nested-paragraphs"; + +describe("sanitizeNestedParagraphs", () => { + it("removes nested

<p> tags inside a paragraph and keeps inner text", () => {
+    const input =
+      '<p>Alpha <p>\u201cOho!\u201d said the board.</p> Omega</p>';
+
+    const expected = "<p>Alpha \u201cOho!\u201d said the board. Omega</p>
"; + + expect(sanitizeNestedParagraphs(input)).toBe(expected); + }); + + it("leaves normal paragraphs unchanged", () => { + const input = "

<p>One.</p>\n<p>Two.</p>

"; + expect(sanitizeNestedParagraphs(input)).toBe(input); + }); + + it("handles multiple nested paragraphs in one block", () => { + const input = '

<p>Start <p>A</p> mid <p>B</p> end</p>';
+    const expected = "<p>Start A mid B end</p>

"; + expect(sanitizeNestedParagraphs(input)).toBe(expected); + }); + + it("repairs malformed
sequences by closing the
tag", () => { + const input = '

Line one
Indented line
Next line

'; + const expected = '

Line one
Indented line
Next line

'; + expect(sanitizeNestedParagraphs(input)).toBe(expected); + }); +}); diff --git a/apps/pipeline/src/tools/new-tooling/sanitize-nested-paragraphs.ts b/apps/pipeline/src/tools/new-tooling/sanitize-nested-paragraphs.ts new file mode 100644 index 00000000..9001a2e5 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/sanitize-nested-paragraphs.ts @@ -0,0 +1,51 @@ +export function sanitizeNestedParagraphs(html: string): string { + html = html.replace(/ "); + let depth = 0; + let out = ""; + let i = 0; + + while (i < html.length) { + const char = html[i]; + if (char !== "<") { + out += char; + i += 1; + continue; + } + + const closeIdx = html.indexOf(">", i); + if (closeIdx === -1) { + out += html.slice(i); + break; + } + + const tag = html.slice(i, closeIdx + 1); + const lower = tag.toLowerCase(); + + if (lower.startsWith("= 1) { + // Nested

<p> -> drop tag, keep content inline
+      } else {
+        out += tag;
+      }
+      depth += 1;
+      i = closeIdx + 1;
+      continue;
+    }
+
+    if (lower.startsWith("</p") && depth > 1) {
+      // Closing nested <p>
-> drop tag + } else { + out += tag; + } + depth = Math.max(0, depth - 1); + i = closeIdx + 1; + continue; + } + + out += tag; + i = closeIdx + 1; + } + + return out; +} diff --git a/apps/pipeline/src/tools/new-tooling/section-wrapper.spec.ts b/apps/pipeline/src/tools/new-tooling/section-wrapper.spec.ts new file mode 100644 index 00000000..9bebbc68 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/section-wrapper.spec.ts @@ -0,0 +1,37 @@ +import { describe, expect, it } from "vitest"; +import { buildSectionWrapper, extractSectionInner, parseAttributes } from "./section-wrapper"; + +describe("section-wrapper", () => { + it("parses attributes with quotes", () => { + const attrs = parseAttributes(' data-chapter="2" data-epub-type="chapter"'); + expect(attrs).toEqual({ "data-chapter": "2", "data-epub-type": "chapter" }); + }); + + it("parses attributes with mixed quotes and unquoted values", () => { + const attrs = parseAttributes(" data-id='x' data-num=3 data-flag "); + expect(attrs).toEqual({ "data-id": "x", "data-num": "3", "data-flag": "" }); + }); + + it("extracts section inner and wrapper", () => { + const html = '

<section data-chapter="2" data-epub-type="chapter"><p>Hi</p></section>';
+    const result = extractSectionInner(html);
+    expect(result.inner).toBe("<p>Hi</p>
"); + expect(result.wrapper).toEqual({ + tagName: "section", + attributes: { "data-chapter": "2", "data-epub-type": "chapter" }, + }); + }); + + it("returns original text when no section wrapper", () => { + const html = "

<p>Hi</p>
"; + const result = extractSectionInner(html); + expect(result.inner).toBe(html); + expect(result.wrapper).toBeNull(); + }); + + it("rebuilds section wrapper with attributes", () => { + const html = '

<section data-chapter="2" data-epub-type="chapter"><p>Hi</p></section>
'; + const result = extractSectionInner(html); + expect(buildSectionWrapper(result.inner, result.wrapper)).toBe(html); + }); +}); diff --git a/apps/pipeline/src/tools/new-tooling/section-wrapper.ts b/apps/pipeline/src/tools/new-tooling/section-wrapper.ts new file mode 100644 index 00000000..447c42be --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/section-wrapper.ts @@ -0,0 +1,73 @@ +export type SectionWrapper = { tagName: string; attributes: Record }; + +export type SectionExtract = { inner: string; wrapper: SectionWrapper | null }; + +function isNameChar(char: string): boolean { + return /[A-Za-z0-9_:-]/.test(char); +} + +export function parseAttributes(raw: string): Record { + const attrs: Record = {}; + let i = 0; + + while (i < raw.length) { + while (i < raw.length && /\s/.test(raw[i])) i++; + if (i >= raw.length) break; + + let name = ""; + while (i < raw.length && isNameChar(raw[i])) { + name += raw[i]; + i += 1; + } + + if (!name) break; + + while (i < raw.length && /\s/.test(raw[i])) i++; + let value = ""; + + if (raw[i] === "=") { + i += 1; + while (i < raw.length && /\s/.test(raw[i])) i++; + + const quote = raw[i]; + if (quote === '"' || quote === "'") { + i += 1; + const start = i; + while (i < raw.length && raw[i] !== quote) i++; + value = raw.slice(start, i); + if (raw[i] === quote) i += 1; + } else { + const start = i; + while (i < raw.length && !/\s|>/.test(raw[i])) i++; + value = raw.slice(start, i); + } + } + + attrs[name] = value; + } + + return attrs; +} + +function buildAttributeString(attributes: Record): string { + const entries = Object.entries(attributes); + if (entries.length === 0) return ""; + return ( + " " + entries.map(([key, value]) => `${key}="${value.replace(/\\"/g, """)}"`).join(" ") + ); +} + +export function extractSectionInner(html: string): SectionExtract { + const match = html.match(/^\s*]*)>([\s\S]*)<\/section>\s*$/i); + if (!match) { + return { inner: html, wrapper: null }; + } + + const attributes = parseAttributes(match[1] ?? ""); + return { inner: match[2] ?? "", wrapper: { tagName: "section", attributes } }; +} + +export function buildSectionWrapper(inner: string, wrapper: SectionWrapper | null): string { + if (!wrapper) return inner; + return `<${wrapper.tagName}${buildAttributeString(wrapper.attributes)}>${inner}`; +} diff --git a/apps/pipeline/src/tools/new-tooling/summaryParagraphs.spec.ts b/apps/pipeline/src/tools/new-tooling/summaryParagraphs.spec.ts new file mode 100644 index 00000000..c262ef30 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/summaryParagraphs.spec.ts @@ -0,0 +1,27 @@ +import { describe, it, expect } from "vitest"; +import { buildParagraphsForSummary } from "./summaryParagraphs"; + +describe("buildParagraphsForSummary", () => { + it("wraps each paragraph in a

<p> with the data index", () => {
+    const output = buildParagraphsForSummary([
+      { dataIndex: 0, text: "Hello world" },
+      { dataIndex: 1, text: "Second paragraph" },
+    ]);
+
+    expect(output).toBe('<p data-index="0">Hello world</p>\n<p data-index="1">Second paragraph</p>

'); + }); + + it("preserves double quotes in text content", () => { + const output = buildParagraphsForSummary([{ dataIndex: 2, text: 'He said "hello" and left.' }]); + + expect(output).toContain('He said "hello" and left.'); + }); + + it("keeps embedded HTML attributes double-quoted", () => { + const output = buildParagraphsForSummary([ + { dataIndex: 3, text: 'Mrs. Inglethorp\'s bedroom' }, + ]); + + expect(output).toContain('alt="Mrs. Inglethorp\'s bedroom"'); + }); +}); diff --git a/apps/pipeline/src/tools/new-tooling/summaryParagraphs.ts b/apps/pipeline/src/tools/new-tooling/summaryParagraphs.ts new file mode 100644 index 00000000..88a323b2 --- /dev/null +++ b/apps/pipeline/src/tools/new-tooling/summaryParagraphs.ts @@ -0,0 +1,5 @@ +export function buildParagraphsForSummary(paragraphs: { text: string; dataIndex: number }[]) { + return paragraphs + .map((paragraph) => `

<p data-index="${paragraph.dataIndex}">${paragraph.text.trim()}</p>
`) + .join("\n"); +} diff --git a/apps/pipeline/src/tools/se-converter/drama-classifier.ts b/apps/pipeline/src/tools/se-converter/drama-classifier.ts new file mode 100644 index 00000000..888e39ab --- /dev/null +++ b/apps/pipeline/src/tools/se-converter/drama-classifier.ts @@ -0,0 +1,161 @@ +import { readdir } from "node:fs/promises"; +import { join } from "node:path"; + +export type BookCategory = "FULL_PLAY" | "EMBEDDED_DRAMA" | "DIALOGUE_ONLY"; + +export interface BookAnalysis { + slug: string; + category: BookCategory; + hasDramatisPersonae: boolean; + hasActFiles: boolean; + hasSceneSections: boolean; + hasDramaTables: boolean; + hasBodyDramaType: boolean; + actCount: number; + dramaTables: number; + regularParagraphs: number; + notes: string[]; +} + +const DEFAULT_BOOKS_DIR = join(import.meta.dir, "../../../standardebooks-data/books"); + +function includesDramaBodyType(content: string): boolean { + const bodyMatch = content.match(/]*epub:type=(['"])([^'"]+)\1/); + if (!bodyMatch) return false; + return bodyMatch[2].split(/\s+/).includes("z3998:drama"); +} + +// eslint-disable-next-line complexity +export async function analyzeBook( + bookSlug: string, + booksDir: string = DEFAULT_BOOKS_DIR, +): Promise { + const textDir = join(booksDir, bookSlug, "text"); + + try { + const files = await readdir(textDir); + const xhtmlFiles = files.filter((f) => f.endsWith(".xhtml")); + + const analysis: BookAnalysis = { + slug: bookSlug, + category: "DIALOGUE_ONLY", + hasDramatisPersonae: files.includes("dramatis-personae.xhtml"), + hasActFiles: files.some((f) => /^act-\d+\.xhtml$/.test(f)), + hasSceneSections: false, + hasDramaTables: false, + hasBodyDramaType: false, + actCount: files.filter((f) => /^act-\d+\.xhtml$/.test(f)).length, + dramaTables: 0, + regularParagraphs: 0, + notes: [], + }; + + let hasAnyDramaContent = false; + + for (const file of xhtmlFiles) { + const content = await Bun.file(join(textDir, file)).text(); + + if (!analysis.hasBodyDramaType && includesDramaBodyType(content)) { + analysis.hasBodyDramaType = true; + hasAnyDramaContent = true; + } + + const dramaTableMatches = content.match(/]*epub:type="z3998:drama"/g); + if (dramaTableMatches) { + analysis.dramaTables += dramaTableMatches.length; + analysis.hasDramaTables = true; + hasAnyDramaContent = true; + } + + if ( + content.includes('epub:type="z3998:scene"') || + content.includes("epub:type='z3998:scene'") + ) { + analysis.hasSceneSections = true; + hasAnyDramaContent = true; + } + + if ( + ![ + "dramatis-personae.xhtml", + "endnotes.xhtml", + "colophon.xhtml", + "imprint.xhtml", + "titlepage.xhtml", + "halftitlepage.xhtml", + ].includes(file) + ) { + const paragraphs = content.match(/]*>/g); + if (paragraphs) { + analysis.regularParagraphs += paragraphs.length; + } + } + + if (content.includes('epub:type="z3998:stage-direction"')) { + hasAnyDramaContent = true; + } + } + + if ( + !hasAnyDramaContent && + !analysis.hasDramatisPersonae && + !analysis.hasActFiles && + !analysis.hasBodyDramaType + ) { + return null; + } + + if ( + analysis.hasDramatisPersonae && + analysis.hasActFiles && + (analysis.hasSceneSections || analysis.hasBodyDramaType) + ) { + analysis.category = "FULL_PLAY"; + analysis.notes.push("Classic play structure"); + } else if (analysis.hasDramatisPersonae && analysis.hasActFiles) { + analysis.category = "FULL_PLAY"; + analysis.notes.push("Has acts and dramatis-personae"); + } else if (analysis.hasSceneSections && analysis.actCount > 0) { + analysis.category = "FULL_PLAY"; + analysis.notes.push("Scene-based 
structure"); + } else if (analysis.hasBodyDramaType && analysis.hasActFiles) { + analysis.category = "FULL_PLAY"; + analysis.notes.push("Body marked as drama with acts"); + } else if (analysis.hasDramaTables) { + if (analysis.regularParagraphs > 100 && analysis.dramaTables < 20) { + analysis.category = "EMBEDDED_DRAMA"; + analysis.notes.push( + `${analysis.dramaTables} drama tables in ${analysis.regularParagraphs} paragraphs of prose`, + ); + } else if (analysis.dramaTables > 50) { + analysis.category = "FULL_PLAY"; + analysis.notes.push("Primarily drama tables"); + } else { + analysis.category = "EMBEDDED_DRAMA"; + analysis.notes.push(`Mixed content: ${analysis.dramaTables} drama sections`); + } + } else if (analysis.hasDramatisPersonae || analysis.hasBodyDramaType) { + analysis.category = "FULL_PLAY"; + analysis.notes.push("Has dramatis-personae or drama body type"); + } else { + analysis.category = "DIALOGUE_ONLY"; + analysis.notes.push("Uses persona markup but not structured drama"); + } + + return analysis; + } catch { + return null; + } +} + +export async function classifyDramaBooks( + bookSlugs: string[], + booksDir: string = DEFAULT_BOOKS_DIR, +): Promise { + const results: BookAnalysis[] = []; + for (const slug of bookSlugs) { + const analysis = await analyzeBook(slug, booksDir); + if (analysis) results.push(analysis); + } + return results; +} diff --git a/apps/pipeline/src/tools/se-converter/find-drama-books.ts b/apps/pipeline/src/tools/se-converter/find-drama-books.ts index cd5b487b..3344eff8 100644 --- a/apps/pipeline/src/tools/se-converter/find-drama-books.ts +++ b/apps/pipeline/src/tools/se-converter/find-drama-books.ts @@ -10,134 +10,13 @@ import { readdir } from "node:fs/promises"; import { join } from "node:path"; +import { analyzeBook, type BookAnalysis } from "./drama-classifier"; const BOOKS_DIR = join(import.meta.dir, "../../../standardebooks-data/books"); const OUTPUT_FILE = join(import.meta.dir, "../../../list-of-play-books.txt"); type BookCategory = "FULL_PLAY" | "EMBEDDED_DRAMA" | "DIALOGUE_ONLY"; -interface BookAnalysis { - slug: string; - category: BookCategory; - hasDramatisPersonae: boolean; - hasActFiles: boolean; - hasSceneSections: boolean; - hasDramaTables: boolean; - actCount: number; - dramaTables: number; - regularParagraphs: number; - notes: string[]; -} - -// eslint-disable-next-line complexity -async function analyzeBook(bookSlug: string): Promise { - const textDir = join(BOOKS_DIR, bookSlug, "text"); - - try { - const files = await readdir(textDir); - const xhtmlFiles = files.filter((f) => f.endsWith(".xhtml")); - - const analysis: BookAnalysis = { - slug: bookSlug, - category: "DIALOGUE_ONLY", - hasDramatisPersonae: files.includes("dramatis-personae.xhtml"), - hasActFiles: files.some((f) => /^act-\d+\.xhtml$/.test(f)), - hasSceneSections: false, - hasDramaTables: false, - actCount: files.filter((f) => /^act-\d+\.xhtml$/.test(f)).length, - dramaTables: 0, - regularParagraphs: 0, - notes: [], - }; - - let hasAnyDramaContent = false; - - for (const file of xhtmlFiles) { - const content = await Bun.file(join(textDir, file)).text(); - - // Count drama tables (embedded drama sections) - const dramaTableMatches = content.match(/]*epub:type="z3998:drama"/g); - if (dramaTableMatches) { - analysis.dramaTables += dramaTableMatches.length; - analysis.hasDramaTables = true; - hasAnyDramaContent = true; - } - - // Check for scene sections (full play indicator) - if ( - content.includes('epub:type="z3998:scene"') || - content.includes("epub:type='z3998:scene'") - 
) { - analysis.hasSceneSections = true; - hasAnyDramaContent = true; - } - - // Count regular paragraphs (outside of drama context) - // Skip dramatis-personae, endnotes, etc - if ( - ![ - "dramatis-personae.xhtml", - "endnotes.xhtml", - "colophon.xhtml", - "imprint.xhtml", - "titlepage.xhtml", - "halftitlepage.xhtml", - ].includes(file) - ) { - const paragraphs = content.match(/]*>/g); - if (paragraphs) { - analysis.regularParagraphs += paragraphs.length; - } - } - - // Check for stage directions (another indicator) - if (content.includes('epub:type="z3998:stage-direction"')) { - hasAnyDramaContent = true; - } - } - - if (!hasAnyDramaContent && !analysis.hasDramatisPersonae && !analysis.hasActFiles) { - return null; // Not a drama book - } - - // Categorize - if (analysis.hasDramatisPersonae && analysis.hasActFiles && analysis.hasSceneSections) { - analysis.category = "FULL_PLAY"; - analysis.notes.push("Classic play structure"); - } else if (analysis.hasDramatisPersonae && analysis.hasActFiles) { - analysis.category = "FULL_PLAY"; - analysis.notes.push("Has acts and dramatis-personae"); - } else if (analysis.hasSceneSections && analysis.actCount > 0) { - analysis.category = "FULL_PLAY"; - analysis.notes.push("Scene-based structure"); - } else if (analysis.hasDramaTables) { - // Has embedded in prose - if (analysis.regularParagraphs > 100 && analysis.dramaTables < 20) { - analysis.category = "EMBEDDED_DRAMA"; - analysis.notes.push( - `${analysis.dramaTables} drama tables in ${analysis.regularParagraphs} paragraphs of prose`, - ); - } else if (analysis.dramaTables > 50) { - analysis.category = "FULL_PLAY"; - analysis.notes.push("Primarily drama tables"); - } else { - analysis.category = "EMBEDDED_DRAMA"; - analysis.notes.push(`Mixed content: ${analysis.dramaTables} drama sections`); - } - } else if (analysis.hasDramatisPersonae) { - analysis.category = "FULL_PLAY"; - analysis.notes.push("Has dramatis-personae only"); - } else { - analysis.category = "DIALOGUE_ONLY"; - analysis.notes.push("Uses persona markup but not structured drama"); - } - - return analysis; - } catch { - return null; - } -} - async function findDramaBooks() { const bookDirs = await readdir(BOOKS_DIR); @@ -146,9 +25,9 @@ async function findDramaBooks() { const dialogueOnly: BookAnalysis[] = []; for (const bookSlug of bookDirs) { - const analysis = await analyzeBook(bookSlug); + const analysis = await analyzeBook(bookSlug, BOOKS_DIR); if (analysis) { - switch (analysis.category) { + switch (analysis.category as BookCategory) { case "FULL_PLAY": fullPlays.push(analysis); break; diff --git a/apps/pipeline/src/tools/se-converter/importSEBook.ts b/apps/pipeline/src/tools/se-converter/importSEBook.ts index fe36a2b2..95d1f582 100644 --- a/apps/pipeline/src/tools/se-converter/importSEBook.ts +++ b/apps/pipeline/src/tools/se-converter/importSEBook.ts @@ -14,10 +14,11 @@ import * as fs from "fs"; import * as path from "path"; +import { JSDOM } from "jsdom"; import { AdminConvexHttpClient } from "../../lib/AdminConvexHttpClient"; import { api } from "@bookgenius/convex/_generated/api"; import { convertSEBook, getSEBookImagesDir, type SEImageReference } from "./index"; -import { JSDOM } from "jsdom"; +import { computeParagraphCount } from "../../lib/paragraphCount"; const SE_BOOKS_DIR = path.resolve(__dirname, "../../../standardebooks-data/books"); const LEGACY_BOOKS_DIR = path.resolve(__dirname, "../../../../../books"); @@ -289,9 +290,7 @@ async function step3_ImportChapters( for (const chapter of chapters) { console.log(` Chapter 
${chapter.chapterNumber}: ${chapter.title || "(no title)"}`); - const dom = new JSDOM(chapter.html); - const section = dom.window.document.querySelector("section"); - const paragraphCount = section?.children.length || 0; + const paragraphCount = computeParagraphCount(chapter.html); await client.action(api.chapterCompiler.uploadHtmlSourceChapter, { bookPath, diff --git a/apps/pipeline/src/tools/se-converter/index.test.ts b/apps/pipeline/src/tools/se-converter/index.test.ts index a5d1ecd6..6b820dfd 100644 --- a/apps/pipeline/src/tools/se-converter/index.test.ts +++ b/apps/pipeline/src/tools/se-converter/index.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from "vitest"; -import { convertSeXhtmlToHtml, wrapInRichXml } from "./index"; +import { convertSeXhtmlToHtml, wrapInRichXml, assertSeConversionTextCoverage } from "./index"; describe("SE Converter", () => { describe("simple chapters (no nesting)", () => { @@ -65,6 +65,67 @@ describe("SE Converter", () => { result.textHtml.indexOf("Chapter Two"), ); }); + + it("includes all top-level articles in a single file", () => { + const files = [ + { + filename: "poems.xhtml", + content: ` + + +
+

I

+

First poem line.

+
+
+

II

+

Second poem line.

+
+ +`, + }, + ]; + + const result = convertSeXhtmlToHtml(files); + + expect(result.lastChapter).toBe(2); + expect(result.textHtml).toContain('data-chapter="1"'); + expect(result.textHtml).toContain('data-chapter="2"'); + expect(result.textHtml).toContain("First poem line."); + expect(result.textHtml).toContain("Second poem line."); + expect(result.textHtml.indexOf("First poem line.")).toBeLessThan( + result.textHtml.indexOf("Second poem line."), + ); + }); + }); + + describe("text coverage validation", () => { + it("throws when converted HTML drops most of the source text", () => { + const files = [ + { + filename: "poems.xhtml", + content: ` + + +
+

I

+

First poem line one. First poem line two.

+
+
+

II

+

Second poem line one. Second poem line two.

+
+ +`, + }, + ]; + + const truncatedHtml = `

<p>First poem line one.</p>
`; + + expect(() => assertSeConversionTextCoverage(files, truncatedHtml, { minWords: 1 })).toThrow( + /SE conversion text check failed/, + ); + }); }); describe("part dividers (like Triplanetary)", () => { diff --git a/apps/pipeline/src/tools/se-converter/index.ts b/apps/pipeline/src/tools/se-converter/index.ts index b7a67546..fe4a3113 100644 --- a/apps/pipeline/src/tools/se-converter/index.ts +++ b/apps/pipeline/src/tools/se-converter/index.ts @@ -1,6 +1,7 @@ import fs from "fs"; import path from "path"; import { JSDOM } from "jsdom"; +import { extractSENotesFromXhtml, rewriteSeNoteRefsToDataNote, type SENote } from "./notes"; export interface SEImageReference { /** Original filename (e.g., "illustration-1.svg") */ @@ -17,6 +18,8 @@ export interface SEConversionResult { lastChapter: number; /** List of images referenced in the book content */ images: SEImageReference[]; + /** Extracted endnotes/footnotes keyed as fnN */ + notes: SENote[]; } // Files to skip - these are not actual book content @@ -101,6 +104,76 @@ function htmlToValidXml(html: string): string { .replace(/<(\/?)(ns\d+|epub):([a-z-]+)/gi, "<$1$3"); } +function normalizeTextForComparison(text: string): string { + return text.toLowerCase().replace(/\s+/g, " ").trim(); +} + +function extractNormalizedTextFromXhtml(content: string): string { + const dom = new JSDOM(content, { contentType: "application/xhtml+xml" }); + const body = dom.window.document.body; + const text = body?.textContent || ""; + return normalizeTextForComparison(text); +} + +function extractNormalizedTextFromHtml(html: string): string { + const dom = new JSDOM(html); + const body = dom.window.document.body; + const text = body?.textContent || ""; + return normalizeTextForComparison(text); +} + +function countWords(text: string): number { + if (!text) return 0; + return text.split(" ").filter(Boolean).length; +} + +export function assertSeConversionTextCoverage( + sourceFiles: { filename: string; content: string }[], + convertedHtml: string, + options: { minRatio?: number; maxRatio?: number; minWords?: number; sampleWords?: number } = {}, +): void { + const minRatio = options.minRatio ?? 0.995; + const maxRatio = options.maxRatio ?? 1.1; + const minWords = options.minWords ?? 200; + const sampleWords = options.sampleWords ?? 24; + + const sourceText = normalizeTextForComparison( + sourceFiles.map((file) => extractNormalizedTextFromXhtml(file.content)).join(" "), + ); + const outputText = extractNormalizedTextFromHtml(convertedHtml); + + const sourceWords = countWords(sourceText); + const outputWords = countWords(outputText); + + if (sourceWords < minWords) { + return; + } + + if (outputWords === 0) { + throw new Error( + `SE conversion text check failed: output words 0 vs input ${sourceWords} (ratio 0.000).`, + ); + } + + const ratio = outputWords / Math.max(1, sourceWords); + if (ratio >= minRatio && ratio <= maxRatio) return; + + const tokens = sourceText.split(" ").filter(Boolean); + const headSample = tokens.slice(0, sampleWords).join(" "); + const tailSample = tokens.slice(-sampleWords).join(" "); + const headFound = headSample ? outputText.includes(headSample) : true; + const tailFound = tailSample ? outputText.includes(tailSample) : true; + + const headSnippet = headSample.slice(0, 120); + const tailSnippet = tailSample.slice(0, 120); + + throw new Error( + `SE conversion text check failed: output words ${outputWords} vs input ${sourceWords} (ratio ${ratio.toFixed(3)}). ` + + `headFound=${headFound}, tailFound=${tailFound}. 
` + + `headSample="${headSnippet}..." tailSample="${tailSnippet}..."`, + ); +} + function slugify(name: string): string { return name .toLowerCase() @@ -301,7 +374,8 @@ function convertDramaTablesToPlayFormat(doc: Document): void { } // eslint-disable-next-line complexity -function extractChaptersFromFile( +function extractChaptersFromElement( + container: Element, file: { filename: string; content: string }, startChapter: number, ): { @@ -309,22 +383,15 @@ function extractChaptersFromFile( htmlParts: string[]; nextChapter: number; } { - const dom = new JSDOM(file.content, { contentType: "application/xhtml+xml" }); - const doc = dom.window.document; - const body = doc.querySelector("body"); - - if (!body) return { chapters: [], htmlParts: [], nextChapter: startChapter }; - - const article = body.querySelector("article, section") || body; - const chapterFormat = detectChapterFormat(article as Element); + const chapterFormat = detectChapterFormat(container); if (chapterFormat === "play") { - convertDramaTablesToPlayFormat(doc); + convertDramaTablesToPlayFormat(container.ownerDocument); } else { - annotateDramaTables(doc); + annotateDramaTables(container.ownerDocument); } - const allNestedSections = article.querySelectorAll( + const allNestedSections = container.querySelectorAll( ":scope > section[data-epub-type], :scope > section[epub\\:type]", ); const nestedChapterSections = Array.from(allNestedSections).filter((section) => { @@ -338,13 +405,13 @@ function extractChaptersFromFile( let chapterCounter = startChapter; if (nestedChapterSections.length > 0) { - const mainTitleEl = article.querySelector( + const mainTitleEl = container.querySelector( ":scope > h1, :scope > h2, :scope > header h1, :scope > header h2", ); const mainTitle = mainTitleEl ? extractTextContent(mainTitleEl) : null; const preambleNodes: Node[] = []; - for (const child of Array.from(article.childNodes)) { + for (const child of Array.from(container.childNodes)) { if (child.nodeType === 1 && (child as Element).tagName?.toLowerCase() === "section") break; preambleNodes.push(child); } @@ -392,14 +459,14 @@ function extractChaptersFromFile( chapterCounter++; } } else { - const titleEl = article.querySelector("h1, h2, header h1, header h2"); + const titleEl = container.querySelector("h1, h2, header h1, header h2"); const title = titleEl ? extractTextContent(titleEl) : file.filename.replace(".xhtml", ""); - const innerHTML = htmlToValidXml(article.innerHTML); + const innerHTML = htmlToValidXml(container.innerHTML); // Preserve article/section-level epub:type for CSS targeting (dedication, epigraph, etc.) - const articleEpubType = - article.getAttribute("epub:type") || article.getAttribute("data-epub-type") || ""; - const epubTypeAttr = articleEpubType ? ` data-epub-type="${articleEpubType}"` : ""; + const containerEpubType = + container.getAttribute("epub:type") || container.getAttribute("data-epub-type") || ""; + const epubTypeAttr = containerEpubType ? ` data-epub-type="${containerEpubType}"` : ""; const formatAttr = chapterFormat !== "prose" ? ` data-chapter-format="${chapterFormat}"` : ""; htmlParts.push( `
\n${innerHTML}\n
`, @@ -408,7 +475,7 @@ function extractChaptersFromFile( chapters.push({ number: chapterCounter, title: escapeXml(title), - content: escapeXml(extractTextContent(article).substring(0, 500)), + content: escapeXml(extractTextContent(container).substring(0, 500)), }); chapterCounter++; } @@ -416,6 +483,41 @@ function extractChaptersFromFile( return { chapters, htmlParts, nextChapter: chapterCounter }; } +function extractChaptersFromFile( + file: { filename: string; content: string }, + startChapter: number, +): { + chapters: { number: number; title: string; content: string }[]; + htmlParts: string[]; + nextChapter: number; +} { + const dom = new JSDOM(file.content, { contentType: "application/xhtml+xml" }); + const doc = dom.window.document; + const body = doc.querySelector("body"); + + if (!body) return { chapters: [], htmlParts: [], nextChapter: startChapter }; + + const topLevelElements = Array.from(body.children).filter((child) => { + const tag = child.tagName?.toLowerCase(); + return tag === "article" || tag === "section"; + }); + + const containers = topLevelElements.length > 0 ? topLevelElements : [body]; + + let chapterCounter = startChapter; + const chapters: { number: number; title: string; content: string }[] = []; + const htmlParts: string[] = []; + + for (const container of containers) { + const result = extractChaptersFromElement(container, file, chapterCounter); + chapters.push(...result.chapters); + htmlParts.push(...result.htmlParts); + chapterCounter = result.nextChapter; + } + + return { chapters, htmlParts, nextChapter: chapterCounter }; +} + export function convertSeXhtmlToHtml( xhtmlFiles: { filename: string; content: string }[], options: { figuresBasePath?: string } = {}, @@ -471,7 +573,7 @@ export function convertSeXhtmlToHtml( ) .join("\n")}\n`; - return { textHtml, chaptersXml, lastChapter: chapterCounter - 1, images }; + return { textHtml, chaptersXml, lastChapter: chapterCounter - 1, images, notes: [] }; } export async function convertSEBook( @@ -512,7 +614,28 @@ export async function convertSEBook( content: fs.readFileSync(path.join(textDir, filename), "utf-8"), })); - return convertSeXhtmlToHtml(xhtmlFiles, options); + const result = convertSeXhtmlToHtml(xhtmlFiles, options); + + // SE notes are typically stored in endnotes.xhtml, with occasional notes.xhtml variants. 
+ const noteFiles = ["endnotes.xhtml", "notes.xhtml"]; + const notesById = new Map(); + for (const noteFilename of noteFiles) { + const notePath = path.join(textDir, noteFilename); + if (!fs.existsSync(notePath)) continue; + + const extracted = extractSENotesFromXhtml(fs.readFileSync(notePath, "utf-8")); + for (const note of extracted) { + if (!notesById.has(note.noteId)) { + notesById.set(note.noteId, note.content); + } + } + } + + result.notes = Array.from(notesById.entries()).map(([noteId, content]) => ({ noteId, content })); + result.textHtml = rewriteSeNoteRefsToDataNote(result.textHtml); + + assertSeConversionTextCoverage(xhtmlFiles, result.textHtml); + return result; } /** @@ -542,6 +665,14 @@ export async function convertAndSaveSEBook(bookSlug: string): Promise { const richXml = wrapInRichXml(result.textHtml); fs.writeFileSync(path.join(inputDir, "rich.xml"), richXml, "utf8"); + const seNotesPath = path.join(inputDir, "se-notes.json"); + if (result.notes.length > 0) { + fs.writeFileSync(seNotesPath, JSON.stringify(result.notes, null, 2), "utf8"); + console.log(`[SE Converter] ${bookSlug} wrote ${result.notes.length} notes to ${seNotesPath}`); + } else if (fs.existsSync(seNotesPath)) { + fs.unlinkSync(seNotesPath); + } + console.log(`[SE Converter] ${bookSlug} saved to ${inputDir}/rich.xml`); } diff --git a/apps/pipeline/src/tools/se-converter/notes.test.ts b/apps/pipeline/src/tools/se-converter/notes.test.ts new file mode 100644 index 00000000..001d9f1a --- /dev/null +++ b/apps/pipeline/src/tools/se-converter/notes.test.ts @@ -0,0 +1,142 @@ +import { describe, expect, it } from "vitest"; +import { convertSEBook } from "./index"; + +import { extractSENotesFromXhtml, rewriteSeNoteRefsToDataNote } from "./notes"; + +describe("SE notes helpers", () => { + describe("extractSENotesFromXhtml", () => { + it("extracts endnote li blocks as fn-style note IDs", () => { + const input = ` + + +
+
    +
  1. First note.

  2. +
  3. Second note.

  4. +
+
+ +`; + + const notes = extractSENotesFromXhtml(input); + + expect(notes).toHaveLength(2); + expect(notes[0]).toEqual({ noteId: "fn1", content: "

<p>First note.</p>
" }); + expect(notes[1].noteId).toBe("fn2"); + expect(notes[1].content).toContain("Second note"); + }); + + it("removes backlink anchors and backlink-only paragraphs", () => { + const input = ` + + +
+
    +
  1. +

    Main note text.

    +

    +
  2. +
+
+ +`; + + const notes = extractSENotesFromXhtml(input); + + expect(notes).toHaveLength(1); + expect(notes[0].noteId).toBe("fn7"); + expect(notes[0].content).toContain("Main note text"); + expect(notes[0].content).not.toContain("↩"); + expect(notes[0].content).not.toContain("backlink"); + }); + + it("ignores list items that are not note-N IDs", () => { + const input = ` + + +
+
    +
  1. Not a note.

  2. +
  3. Valid note.

  4. +
+
+ +`; + + const notes = extractSENotesFromXhtml(input); + + expect(notes).toHaveLength(1); + expect(notes[0]).toEqual({ noteId: "fn3", content: "

<p>Valid note.</p>
" }); + }); + }); + + describe("rewriteSeNoteRefsToDataNote", () => { + it("rewrites endnotes.xhtml#note-N links to data-note", () => { + const input = ` +
+

Text 12

+
+ `; + + const output = rewriteSeNoteRefsToDataNote(input); + + expect(output).toContain('data-note="12"'); + expect(output).not.toContain('href="endnotes.xhtml#note-12"'); + expect(output).toContain('class="link-note"'); + expect(output).not.toContain('id="noteref-12"'); + expect(output).not.toContain('data-epub-type="noteref"'); + }); + + it("rewrites notes.xhtml#note-N links as well", () => { + const input = ` +
+

Text 2

+
+ `; + + const output = rewriteSeNoteRefsToDataNote(input); + + expect(output).toContain('data-note="2"'); + expect(output).not.toContain('href="notes.xhtml#note-2"'); + }); + + it("does not rewrite non-endnote links", () => { + const input = ` +
+

Keep note

+
+ `; + + const output = rewriteSeNoteRefsToDataNote(input); + + expect(output).toContain('href="appendix-2.xhtml#appendix-2-3"'); + expect(output).not.toContain("data-note="); + }); + + it("rewrites multiple note references in one paragraph", () => { + const input = ` +
+

Text1 more2

+
+ `; + + const output = rewriteSeNoteRefsToDataNote(input); + + expect(output).toContain('data-note="1"'); + expect(output).toContain('data-note="2"'); + expect(output).not.toContain('href="endnotes.xhtml#note-1"'); + expect(output).not.toContain('href="endnotes.xhtml#note-2"'); + }); + }); + + describe("integration with SE converter", () => { + it("extracts notes and rewrites note hrefs for sample SE book", async () => { + const result = await convertSEBook("abu-al-ala-al-maarri_the-luzumiyat_ameen-rihani"); + + expect(result.notes.length).toBeGreaterThan(0); + expect(result.notes.some((n) => n.noteId === "fn1")).toBe(true); + expect(result.textHtml).toContain('data-note="1"'); + expect(result.textHtml).not.toContain('href="endnotes.xhtml#note-1"'); + }); + }); +}); diff --git a/apps/pipeline/src/tools/se-converter/notes.ts b/apps/pipeline/src/tools/se-converter/notes.ts new file mode 100644 index 00000000..cfa47a53 --- /dev/null +++ b/apps/pipeline/src/tools/se-converter/notes.ts @@ -0,0 +1,85 @@ +import { JSDOM } from "jsdom"; + +export interface SENote { + noteId: string; + content: string; +} + +const SE_NOTE_HREF_REGEX = /(?:^|\/)(?:endnotes|notes)\.xhtml#note-(\d+)$/i; + +function removeBacklinks(container: Element): void { + const backlinkAnchors = container.querySelectorAll( + 'a[epub\\:type="backlink"], a[data-epub-type="backlink"]', + ); + + for (const anchor of Array.from(backlinkAnchors)) { + const parent = anchor.parentElement; + anchor.remove(); + + if (parent && parent.tagName.toLowerCase() === "p" && !(parent.textContent || "").trim()) { + parent.remove(); + } + } +} + +function cleanupNoteContent(html: string): string { + return html + .replace(/\s+xmlns(:[a-z0-9]+)?="[^"]*"/gi, "") + .replace(/\s+(ns\d+|epub):type="([^"]*)"/gi, ' data-epub-type="$2"') + .replace(/\s+(ns\d+|epub):[a-z-]+="[^"]*"/gi, "") + .trim(); +} + +export function extractSENotesFromXhtml(xhtml: string): SENote[] { + const dom = new JSDOM(xhtml, { contentType: "application/xhtml+xml" }); + const doc = dom.window.document; + + const notes: SENote[] = []; + const seen = new Set(); + const items = doc.querySelectorAll('li[id^="note-"]'); + + for (const item of Array.from(items)) { + const rawId = item.getAttribute("id") || ""; + const match = rawId.match(/^note-(\d+)$/); + if (!match) continue; + + const noteId = `fn${match[1]}`; + if (seen.has(noteId)) continue; + + const clone = item.cloneNode(true) as Element; + removeBacklinks(clone); + + const content = cleanupNoteContent(clone.innerHTML); + if (!content) continue; + + notes.push({ noteId, content }); + seen.add(noteId); + } + + return notes; +} + +export function rewriteSeNoteRefsToDataNote(html: string): string { + const dom = new JSDOM(html); + const doc = dom.window.document; + + const anchors = doc.querySelectorAll("a[href]"); + for (const anchor of Array.from(anchors)) { + const href = anchor.getAttribute("href") || ""; + const match = href.match(SE_NOTE_HREF_REGEX); + if (!match) continue; + + const noteNumber = match[1]; + anchor.setAttribute("data-note", noteNumber); + anchor.removeAttribute("href"); + anchor.removeAttribute("id"); + anchor.removeAttribute("data-epub-type"); + anchor.removeAttribute("epub:type"); + + const classes = new Set((anchor.getAttribute("class") || "").split(/\s+/).filter(Boolean)); + classes.add("link-note"); + anchor.setAttribute("class", Array.from(classes).join(" ")); + } + + return dom.serialize(); +} diff --git a/apps/pipeline/src/tools/standardebooks-queue.ts b/apps/pipeline/src/tools/standardebooks-queue.ts 
new file mode 100644 index 00000000..7a3dfcbf --- /dev/null +++ b/apps/pipeline/src/tools/standardebooks-queue.ts @@ -0,0 +1,281 @@ +#!/usr/bin/env bun +import path from "path"; +import fs from "fs"; +import yargs from "yargs"; +import { hideBin } from "yargs/helpers"; +import dotenv from "dotenv"; +import { convertAndSaveSEBook } from "./se-converter/index"; +import { startPipeline } from "../server/pipeline"; +import { classifyDramaBooks, type BookAnalysis } from "./se-converter/drama-classifier"; + +dotenv.config({ path: path.resolve(import.meta.dir, "..", "..", ".env") }); + +const PIPELINE_ROOT = path.resolve(import.meta.dir, "..", ".."); +const REPO_ROOT = path.resolve(PIPELINE_ROOT, "..", ".."); +const CONVEX_ASSETS_DIR = path.join(REPO_ROOT, "ConvexAssets", "books"); +const INDEX_PATH = path.join(PIPELINE_ROOT, "standardebooks-data", "index.json"); +const QUEUE_PATH = path.join(PIPELINE_ROOT, "standardebooks-data", "queue.json"); +const BOOKS_DATA_DIR = path.join(PIPELINE_ROOT, "books-data"); + +const EMBEDDED_DRAMA_WHITELIST = [ + "a-a-milne_the-house-at-pooh-corner", + "james-joyce_ulysses", + "f-scott-fitzgerald_the-beautiful-and-damned", + "george-eliot_daniel-deronda", + "ann-radcliffe_the-mysteries-of-udolpho", + "dorothy-l-sayers_clouds-of-witness", + "dorothy-l-sayers_unnatural-death", + "anna-katharine-green_the-leavenworth-case", + "richard-hughes_a-high-wind-in-jamaica", + "william-faulkner_soldiers-pay", + "william-wells-brown_clotel", +]; + +type QueueStatus = "queued" | "running" | "done" | "failed" | "skipped"; + +interface QueueItem { + slug: string; + status: QueueStatus; + attempts: number; + lastError?: string; + updatedAt: string; +} + +interface QueueFile { + meta: { + createdAt: string; + updatedAt: string; + total: number; + embeddedWhitelist: string[]; + excluded: { fullPlay: number; embeddedDrama: number; alreadyExists: number }; + includedEmbedded: number; + }; + items: QueueItem[]; +} + +function readQueue(): QueueFile { + if (!fs.existsSync(QUEUE_PATH)) { + throw new Error(`Queue file not found: ${QUEUE_PATH}`); + } + return JSON.parse(fs.readFileSync(QUEUE_PATH, "utf-8")) as QueueFile; +} + +function writeQueue(queue: QueueFile) { + queue.meta.updatedAt = new Date().toISOString(); + fs.writeFileSync(QUEUE_PATH, JSON.stringify(queue, null, 2)); +} + +function summarizeQueue(queue: QueueFile) { + const counts: Record = { + queued: 0, + running: 0, + done: 0, + failed: 0, + skipped: 0, + }; + for (const item of queue.items) { + counts[item.status] += 1; + } + console.log(`Queue: ${queue.items.length} items`); + console.log(` queued: ${counts.queued}`); + console.log(` running: ${counts.running}`); + console.log(` done: ${counts.done}`); + console.log(` failed: ${counts.failed}`); + console.log(` skipped: ${counts.skipped}`); +} + +function sleep(ms: number) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +async function waitForJob(job: { status: string; logs?: string[]; error?: string }) { + let lastLogIndex = 0; + while (true) { + const logs = job.logs || []; + if (logs.length > lastLogIndex) { + for (let i = lastLogIndex; i < logs.length; i += 1) { + console.log(logs[i]); + } + lastLogIndex = logs.length; + } + + if (job.status === "done") return { status: "done" as const }; + if (job.status === "error") return { status: "error" as const, error: job.error }; + + await sleep(1000); + } +} + +async function buildQueue() { + if (!fs.existsSync(INDEX_PATH)) { + throw new Error(`index.json not found: ${INDEX_PATH}`); + } + + const index = 
JSON.parse(fs.readFileSync(INDEX_PATH, "utf-8")) as { books: { slug: string }[] }; + const slugs = index.books.map((b) => b.slug); + + const analyses = await classifyDramaBooks(slugs); + const analysisBySlug = new Map(); + for (const analysis of analyses) { + analysisBySlug.set(analysis.slug, analysis); + } + + const now = new Date().toISOString(); + const items: QueueItem[] = []; + let excludedFull = 0; + let excludedEmbedded = 0; + let excludedExisting = 0; + let includedEmbedded = 0; + + for (const slug of slugs) { + const existingDir = path.join(BOOKS_DATA_DIR, slug); + const convexMirrorDir = path.join(CONVEX_ASSETS_DIR, slug); + if (fs.existsSync(existingDir) || fs.existsSync(convexMirrorDir)) { + excludedExisting += 1; + continue; + } + + const analysis = analysisBySlug.get(slug); + if (analysis?.category === "FULL_PLAY") { + excludedFull += 1; + continue; + } + + if (analysis?.category === "EMBEDDED_DRAMA") { + if (!EMBEDDED_DRAMA_WHITELIST.includes(slug)) { + excludedEmbedded += 1; + continue; + } + includedEmbedded += 1; + } + + items.push({ slug, status: "queued", attempts: 0, updatedAt: now }); + } + + const queue: QueueFile = { + meta: { + createdAt: now, + updatedAt: now, + total: items.length, + embeddedWhitelist: [...EMBEDDED_DRAMA_WHITELIST], + excluded: { + fullPlay: excludedFull, + embeddedDrama: excludedEmbedded, + alreadyExists: excludedExisting, + }, + includedEmbedded, + }, + items, + }; + + writeQueue(queue); + + console.log(`Queue written to ${QUEUE_PATH}`); + summarizeQueue(queue); + console.log(`Excluded: full_play=${excludedFull}, embedded_drama=${excludedEmbedded}`); + console.log(`Skipped (already exists): ${excludedExisting}`); + console.log(`Included embedded drama (whitelist): ${includedEmbedded}`); +} + +async function runQueue(limit?: number) { + const queue = readQueue(); + let consecutiveFailures = 0; + let processed = 0; + + for (const item of queue.items) { + if (item.status === "done" || item.status === "skipped" || item.status === "failed") { + continue; + } + + if (limit !== undefined && processed >= limit) { + console.log(`Reached limit ${limit}. Stopping.`); + break; + } + + const existingDir = path.join(BOOKS_DATA_DIR, item.slug); + const convexMirrorDir = path.join(CONVEX_ASSETS_DIR, item.slug); + if (fs.existsSync(existingDir) || fs.existsSync(convexMirrorDir)) { + item.status = "skipped"; + item.updatedAt = new Date().toISOString(); + writeQueue(queue); + console.log(`Skipped (already exists in books-data or ConvexAssets).`); + continue; + } + + item.status = "running"; + item.attempts += 1; + item.lastError = undefined; + item.updatedAt = new Date().toISOString(); + writeQueue(queue); + + console.log(`\n=== Processing ${item.slug} ===`); + + try { + await convertAndSaveSEBook(item.slug); + const job = await startPipeline({ slug: item.slug }); + const result = await waitForJob(job); + + if (result.status === "done") { + item.status = "done"; + item.updatedAt = new Date().toISOString(); + writeQueue(queue); + console.log(`✔ Completed ${item.slug}`); + consecutiveFailures = 0; + } else { + item.status = "failed"; + item.lastError = result.error || "Unknown error"; + item.updatedAt = new Date().toISOString(); + writeQueue(queue); + console.log(`✖ Failed ${item.slug}: ${item.lastError}`); + consecutiveFailures += 1; + } + } catch (err) { + item.status = "failed"; + item.lastError = err instanceof Error ? 
err.message : String(err); + item.updatedAt = new Date().toISOString(); + writeQueue(queue); + console.log(`✖ Failed ${item.slug}: ${item.lastError}`); + consecutiveFailures += 1; + } + + processed += 1; + + if (consecutiveFailures >= 3) { + console.log("Stopping after 3 consecutive failures."); + break; + } + } + + console.log("\nQueue run complete."); + summarizeQueue(queue); +} + +async function main() { + const argv = await yargs(hideBin(process.argv)) + .scriptName("standardebooks-queue") + .command("build", "Build queue from standardebooks-data", {}, async () => { + await buildQueue(); + }) + .command( + "run", + "Run queue (one book at a time)", + (y) => y.option("limit", { type: "number", describe: "Max items to process" }), + async (args) => { + await runQueue(args.limit); + }, + ) + .command("status", "Show queue status", {}, () => { + const queue = readQueue(); + summarizeQueue(queue); + }) + .demandCommand(1) + .help() + .parse(); + + return argv; +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/apps/pipeline/tsconfig.json b/apps/pipeline/tsconfig.json index a017353a..b35939fb 100644 --- a/apps/pipeline/tsconfig.json +++ b/apps/pipeline/tsconfig.json @@ -1,7 +1,7 @@ { "compilerOptions": { "target": "ES2022", - "lib": ["ES2022"], + "lib": ["ES2022", "DOM"], "module": "ES2022", "moduleResolution": "bundler", "types": ["bun-types", "jest"], diff --git a/apps/platform/package.json b/apps/platform/package.json index 063991bf..9ba99e54 100644 --- a/apps/platform/package.json +++ b/apps/platform/package.json @@ -51,6 +51,7 @@ "@sentry/react": "^10.18.0", "@supabase/supabase-js": "^2.55.0", "@tanstack/react-query": "^5.83.0", + "@trpc/client": "^11.9.0", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "cmdk": "^1.1.1", diff --git a/apps/platform/src/App.tsx b/apps/platform/src/App.tsx index f32c5236..b83ad640 100644 --- a/apps/platform/src/App.tsx +++ b/apps/platform/src/App.tsx @@ -6,6 +6,7 @@ import { Toaster } from "@platform/components/ui/toaster"; import { Toaster as Sonner } from "@platform/components/ui/sonner"; import { TooltipProvider } from "@platform/components/ui/tooltip"; import Index from "./pages/Index"; +import { StandardEbooksPage } from "./pages/StandardEbooksPage"; import NotFound from "./pages/NotFound"; import Terms from "./pages/Terms"; import AuthComponentsWrapper from "./pages/AuthComponentsWrapper"; @@ -45,6 +46,7 @@ const AppWithAuth = () => { } /> + } /> } /> } /> } /> diff --git a/apps/platform/src/components/standard-ebooks/StandardEbooksBookCard.tsx b/apps/platform/src/components/standard-ebooks/StandardEbooksBookCard.tsx new file mode 100644 index 00000000..cda50f3a --- /dev/null +++ b/apps/platform/src/components/standard-ebooks/StandardEbooksBookCard.tsx @@ -0,0 +1,261 @@ +import { useState, useEffect, useCallback, useRef } from "react"; +import { BookOpen, Play, Clock } from "lucide-react"; +import { Button } from "@platform/components/ui/button"; +import { Badge } from "@platform/components/ui/badge"; + +export type CollectionBook = { + title: string; + author: string; + slug: string; + cover: string; + coverThumb: string; + coverColor: string; + epoch: string; + genre: string; + kind: string; + hasAudio: boolean; + generatedDescription?: string; + generatedHook?: string; + readingTime?: string; +}; + +export interface BookCardProps { + book: CollectionBook; + onSelect: (slug: string) => void; + onOpenModal?: (book: CollectionBook) => void; + index: number; + totalColumns: number; +} + +// 
eslint-disable-next-line complexity +export function BookCard({ book, onSelect, onOpenModal, index, totalColumns }: BookCardProps) { + const [isHovered, setIsHovered] = useState(false); + const [isExpanded, setIsExpanded] = useState(false); + const hoverTimeoutRef = useRef(null); + const cardRef = useRef(null); + + const isRightSide = index % totalColumns >= totalColumns / 2; + + const handleMouseEnter = useCallback(() => { + setIsHovered(true); + hoverTimeoutRef.current = window.setTimeout(() => { + setIsExpanded(true); + }, 500); + }, []); + + const handleMouseLeave = useCallback(() => { + setIsHovered(false); + if (hoverTimeoutRef.current) { + clearTimeout(hoverTimeoutRef.current); + hoverTimeoutRef.current = null; + } + setIsExpanded(false); + }, []); + + useEffect(() => { + return () => { + if (hoverTimeoutRef.current) { + clearTimeout(hoverTimeoutRef.current); + } + }; + }, []); + + const handleStartClick = useCallback( + (e: React.MouseEvent) => { + e.stopPropagation(); + onSelect(book.slug); + }, + [book.slug, onSelect], + ); + + const handleCardClick = useCallback(() => { + if (onOpenModal) { + onOpenModal(book); + } else if (!isExpanded) { + onSelect(book.slug); + } + }, [isExpanded, book, onSelect, onOpenModal]); + + const panelWidth = 275; + + const coverElement = ( +
+
+ {book.cover ? ( + {book.title} + ) : ( +
+ +
+ )} +
+
+
+ ); + + const infoElement = ( +
+
+
+

+ {book.title} +

+

{book.author}

+ {book.readingTime && ( +
+ + {book.readingTime} +
+ )} +
+ + {book.generatedHook && ( +
+
+

+ "{book.generatedHook}" +

+
+
+ )} + +
+
+ {book.epoch && ( + + {book.epoch} + + )} + {book.genre && ( + + {book.genre} + + )} + {book.kind && ( + + {book.kind} + + )} +
+ + +
+
+
+ ); + + const descElement = book.generatedDescription ? ( +
+
+

{book.generatedDescription}

+
+
+ ) : null; + + return ( +
+
+
+ {isRightSide ? ( + <> + {descElement} + {infoElement} + {coverElement} + + ) : ( + <> + {coverElement} + {infoElement} + {descElement} + + )} +
+ +
+

+ {book.title} +

+

+ {book.author} +

+
+
+
+ ); +} diff --git a/apps/platform/src/components/standard-ebooks/StandardEbooksBookModal.tsx b/apps/platform/src/components/standard-ebooks/StandardEbooksBookModal.tsx new file mode 100644 index 00000000..05a22d5a --- /dev/null +++ b/apps/platform/src/components/standard-ebooks/StandardEbooksBookModal.tsx @@ -0,0 +1,191 @@ +/* eslint-disable react-hooks/set-state-in-effect */ +import { useEffect, useCallback, useState } from "react"; +import { X, BookOpen, Play } from "lucide-react"; +import { Button } from "@platform/components/ui/button"; +import { Badge } from "@platform/components/ui/badge"; +import type { CollectionBook } from "./StandardEbooksBookCard"; + +interface BookModalProps { + book: CollectionBook | null; + onClose: () => void; + onSelect: (slug: string) => void; +} + +export function BookModal({ book, onClose, onSelect }: BookModalProps) { + const [isClosing, setIsClosing] = useState(false); + const [isVisible, setIsVisible] = useState(false); + const [displayedBook, setDisplayedBook] = useState(null); + + useEffect(() => { + if (book) { + setDisplayedBook(book); + setIsClosing(false); + setIsVisible(false); + requestAnimationFrame(() => { + requestAnimationFrame(() => { + setIsVisible(true); + }); + }); + } + }, [book]); + + const triggerClose = useCallback(() => { + setIsClosing(true); + setTimeout(() => { + onClose(); + setDisplayedBook(null); + }, 150); + }, [onClose]); + + const handleKeyDown = useCallback( + (e: KeyboardEvent) => { + if (e.key === "Escape") triggerClose(); + }, + [triggerClose], + ); + + useEffect(() => { + if (book) { + document.addEventListener("keydown", handleKeyDown); + document.body.style.overflow = "hidden"; + } + return () => { + document.removeEventListener("keydown", handleKeyDown); + document.body.style.overflow = ""; + }; + }, [book, handleKeyDown]); + + const handleBackdropClick = useCallback( + (e: React.MouseEvent) => { + if (e.target === e.currentTarget) triggerClose(); + }, + [triggerClose], + ); + + const handleSelect = useCallback(() => { + if (displayedBook) { + onSelect(displayedBook.slug); + triggerClose(); + } + }, [displayedBook, onSelect, triggerClose]); + + if (!book && !displayedBook) return null; + const shownBook = displayedBook ?? book; + if (!shownBook) return null; + + const currentBook = shownBook; + + const coverWidth = 340; + const coverHeight = 510; + const panelWidth = 320; + + return ( +
+ + +
+ {currentBook.generatedDescription && ( +
+

+ {currentBook.generatedDescription} +

+
+ )} + +
+
+

+ {currentBook.title} +

+

{currentBook.author}

+
+ + {currentBook.generatedHook && ( +
+
+

+ "{currentBook.generatedHook}" +

+
+
+ )} + +
+
+ {currentBook.epoch && ( + + {currentBook.epoch} + + )} + {currentBook.genre && ( + + {currentBook.genre} + + )} + {currentBook.kind && ( + + {currentBook.kind} + + )} +
+ + +
+
+ +
+ {currentBook.cover ? ( + {currentBook.title} + ) : ( +
+ +
+ )} +
+
+
+ ); +} diff --git a/apps/platform/src/lib/standardEbooksTrpc.ts b/apps/platform/src/lib/standardEbooksTrpc.ts new file mode 100644 index 00000000..0a07a7c6 --- /dev/null +++ b/apps/platform/src/lib/standardEbooksTrpc.ts @@ -0,0 +1,8 @@ +import { createTRPCProxyClient, httpBatchLink } from "@trpc/client"; +// import type { AppRouter } from "@pipeline/src/server/router"; + +// should set to AppRouter but that causes type issues. we will move the data fetching to convex anyway soon. +// eslint-disable-next-line @typescript-eslint/no-explicit-any +export const standardEbooksTrpc = createTRPCProxyClient({ + links: [httpBatchLink({ url: "http://localhost:4000/trpc" })], +}); diff --git a/apps/platform/src/pages/StandardEbooksPage.tsx b/apps/platform/src/pages/StandardEbooksPage.tsx new file mode 100644 index 00000000..db9a21a1 --- /dev/null +++ b/apps/platform/src/pages/StandardEbooksPage.tsx @@ -0,0 +1,303 @@ +import { useState, useEffect, useCallback, useRef, useLayoutEffect, useMemo } from "react"; +import { useNavigate } from "react-router-dom"; +import { Loader2, ArrowLeft, BookOpen, Search, X } from "lucide-react"; +import { Button } from "@platform/components/ui/button"; +import { standardEbooksTrpc } from "@platform/lib/standardEbooksTrpc"; +import { + BookCard, + type CollectionBook, +} from "@platform/components/standard-ebooks/StandardEbooksBookCard"; +import { BookModal } from "@platform/components/standard-ebooks/StandardEbooksBookModal"; +import { useRouteTransition } from "@platform/providers/RouteTransitionProvider"; +import { SPLASH_FADE_DURATION_MS } from "@player/components/SplashScreen"; + +type SEBook = { + slug: string; + title: string; + author: string; + authorFileAs: string; + description: string; + wordCount: number; + language: string; + subjects: string[]; + generatedDescription?: string; + generatedHook?: string; +}; + +function formatReadingTime(wordCount: number): string { + const minutes = Math.ceil(wordCount / 250); + if (minutes < 60) return "~1 hr"; + const hours = Math.round(minutes / 60); + return `~${hours} hrs`; +} + +function seBookToCollectionBook(book: SEBook): CollectionBook { + return { + title: book.title, + author: book.author, + slug: book.slug, + cover: `http://localhost:4000/se-cover/${book.slug}`, + coverThumb: `http://localhost:4000/se-cover/${book.slug}`, + coverColor: "#2a2a3d", + epoch: "", + genre: book.subjects[0] || "", + kind: "", + hasAudio: false, + generatedDescription: book.generatedDescription || book.description, + generatedHook: book.generatedHook || "", + readingTime: formatReadingTime(book.wordCount), + }; +} + +function AuthorLetterRow({ + letter, + books, + onSelectBook, + onOpenModal, +}: { + letter: string; + books: SEBook[]; + onSelectBook: (slug: string) => void; + onOpenModal: (book: CollectionBook) => void; +}) { + const [isVisible, setIsVisible] = useState(false); + const containerRef = useRef(null); + const scrollRef = useRef(null); + const scrollPosRef = useRef(0); + + useEffect(() => { + const observer = new IntersectionObserver( + ([entry]) => { + if (entry.isIntersecting) { + setIsVisible(true); + } else { + if (scrollRef.current) { + scrollPosRef.current = scrollRef.current.scrollLeft; + } + setIsVisible(false); + } + }, + { rootMargin: "600px" }, + ); + + if (containerRef.current) { + observer.observe(containerRef.current); + } + + return () => observer.disconnect(); + }, []); + + useLayoutEffect(() => { + if (isVisible && scrollRef.current && scrollPosRef.current > 0) { + scrollRef.current.scrollLeft = 
scrollPosRef.current; + } + }, [isVisible, books]); + + if (!isVisible) { + return
; + } + + const collectionBooks = books.map(seBookToCollectionBook); + + return ( +
+
+

{letter}

+ {books.length} {books.length === 1 ? "book" : "books"}
+ +
+
+ {collectionBooks.map((book, i) => ( +
+ +
+ ))} +
+
+
+ ); +} + +export function StandardEbooksPage() { + const navigate = useNavigate(); + const { startTransition, setNavigatedFromPlatform } = useRouteTransition(); + const [groupedBooks, setGroupedBooks] = useState>({}); + const [allBooks, setAllBooks] = useState([]); + const [isLoading, setIsLoading] = useState(true); + const [totalBooks, setTotalBooks] = useState(0); + const [modalBook, setModalBook] = useState(null); + const [searchQuery, setSearchQuery] = useState(""); + + useEffect(() => { + const loadData = async () => { + try { + // @ts-expect-error - incorrect typing somehow + const data = await standardEbooksTrpc.getStandardEbooksIndex.query(); + setGroupedBooks(data.groupedByAuthorLetter); + setAllBooks(data.books); + setTotalBooks(data.books.length); + } catch (e) { + console.error("Failed to load Standard Ebooks index:", e); + } finally { + setIsLoading(false); + } + }; + loadData(); + }, []); + + const bookBySlug = useMemo(() => new Map(allBooks.map((book) => [book.slug, book])), [allBooks]); + + const handleBookSelect = useCallback( + (slug: string) => { + const book = bookBySlug.get(slug); + const title = book?.title ?? "BookGenius"; + const author = book?.author ?? ""; + const phrases = book?.generatedHook ? [book.generatedHook] : []; + + setNavigatedFromPlatform(true); + startTransition({ title, phrases, author, showStartButton: false }); + + setTimeout(() => { + navigate(`/reader?book=${encodeURIComponent(slug)}`, { + state: { meta: { title, phrases, author } }, + }); + }, SPLASH_FADE_DURATION_MS); + }, + [bookBySlug, navigate, setNavigatedFromPlatform, startTransition], + ); + + const handleOpenModal = useCallback((book: CollectionBook) => { + setModalBook(book); + }, []); + + const handleCloseModal = useCallback(() => { + setModalBook(null); + }, []); + + if (isLoading) { + return ( +
+ +
+ ); + } + + const normalizedQuery = searchQuery.trim().toLowerCase(); + const filteredBooks = normalizedQuery + ? allBooks.filter((book) => { + const haystack = [ + book.title, + book.author, + book.authorFileAs, + book.description, + book.subjects.join(" "), + ] + .join(" ") + .toLowerCase(); + return haystack.includes(normalizedQuery); + }) + : allBooks; + + const visibleGroupedBooks = normalizedQuery + ? filteredBooks.reduce>((acc, book) => { + const firstLetter = (book.authorFileAs || book.author).charAt(0).toUpperCase(); + if (!acc[firstLetter]) acc[firstLetter] = []; + acc[firstLetter].push(book); + return acc; + }, {}) + : groupedBooks; + + const sortedLetters = Object.keys(visibleGroupedBooks).sort(); + const visibleCount = normalizedQuery ? filteredBooks.length : totalBooks; + + return ( +
+
+
+

+ + Standard Ebooks +

+

+ {visibleCount} professionally formatted public domain books +

+
+ +
+ +
+
+
+ + setSearchQuery(e.target.value)} + placeholder="Search by title, author, or subject..." + className="w-full pl-10 pr-10 py-2 rounded-md bg-background border border-border text-foreground placeholder:text-muted-foreground focus:outline-none focus:ring-2 focus:ring-primary/50" + /> + {searchQuery.trim() && ( + + )} +
+
+ {normalizedQuery + ? `Showing ${visibleCount} of ${totalBooks} books` + : `${totalBooks} books`} +
+
+
+ +
+ {sortedLetters.length === 0 ? ( +
+ No books match your search. +
+ ) : ( + sortedLetters.map((letter, idx) => ( +
+ +
+ )) + )} +
+ + +
+ ); +} diff --git a/apps/player/src/components/modals/BookChaptersModal.tsx b/apps/player/src/components/modals/BookChaptersModal.tsx index ffc27cd5..56e471cd 100644 --- a/apps/player/src/components/modals/BookChaptersModal.tsx +++ b/apps/player/src/components/modals/BookChaptersModal.tsx @@ -3,7 +3,6 @@ import { useTranslation } from "react-i18next"; import ModalUI from "@player/components/modals/ModalUI"; import { systemNavigateTo } from "@player/helpers/paragraphsNavigation"; -import { getChapterTitle } from "@player/utils/getChapterTitle"; import { Button } from "../ui/button"; import { useBookConvex } from "@player/context/BookConvexContext"; import { useLocationRange } from "@player/hooks/useLocationRange"; @@ -34,12 +33,12 @@ const BookChaptersModal: React.FC = ({ onClose }) => { return bookData.chapters.map((chapter, index) => { return { id: parseInt(chapter.id), - title: getChapterTitle(parseInt(chapter.id), t), + title: chapter.title, page: (index + 1).toString(), isLocked: hasDemoAccess && parseInt(chapter.id, 10) > maxDemoChapter, }; }); - }, [t, hasDemoAccess, bookData]); + }, [hasDemoAccess, bookData]); const navigateToChapter = (chapterId: number) => { systemNavigateTo({ currentChapter: chapterId, currentParagraph: 0 }); diff --git a/apps/player/src/components/modals/CharacterModal.tsx b/apps/player/src/components/modals/CharacterModal.tsx index f48d5d5f..3e6f643f 100644 --- a/apps/player/src/components/modals/CharacterModal.tsx +++ b/apps/player/src/components/modals/CharacterModal.tsx @@ -11,10 +11,10 @@ import { getSavedLocation, systemNavigateTo } from "@player/helpers/paragraphsNa import { useBookConvex } from "@player/context/BookConvexContext"; import { highlightSearchInParagraph } from "@player/utils/textHighlighting"; import { DialogEnhanceClose } from "../ui/dialog"; -import { getChapterTitle } from "@player/utils/getChapterTitle"; import { resolveCharacterSnapshot } from "@player/utils/characterOverrides"; import { isVideoFile } from "@player/helpers/isVideoFile"; import { getAvatarSource } from "@player/helpers/svgAvatars"; +import { slugToDisplayName } from "@player/helpers/minorCharacterUtils"; import { useBottomInput } from "@player/stores/modals/bottomInput.store"; import { useSearchModal } from "@player/stores/modals/searchModal.store"; import { FILTER_OPTIONS } from "@player/utils/filterOptions"; @@ -51,12 +51,21 @@ const CharacterModal: React.FC = ({ const { setValue } = useBottomInput(); const { openModal: openSearchModal, setLastClickedAppearanceId, setResults } = useSearchModal(); const { pauseAllTimers, showAllElements } = useElementVisibilityStore(); - const { charactersData } = useBookConvex(); + const { charactersData, bookData } = useBookConvex(); + const chapterTitle = useMemo( + () => bookData!.chapters.find((c) => c.id === String(chapter))?.title, + [chapter, bookData], + ); const matchingCharacter = useMemo( () => charactersData.find((c) => c.slug === characterSlug), [characterSlug, charactersData], ); + // Generic avatar for unknown/minor characters (if available) + const genericCharacter = useMemo( + () => charactersData.find((c) => c.slug === "generic-avatar"), + [charactersData], + ); const latestSummary = useMemo( () => (matchingCharacter ? findLatestSummaryInRange(matchingCharacter, endChapter) : ""), [matchingCharacter, endChapter], @@ -176,7 +185,45 @@ const CharacterModal: React.FC = ({ return option ? 
t(option.translationKey) : type; }; - if (!matchingCharacter) return null; + // Handle unknown characters (speakers not in characterMetadata) + if (!matchingCharacter) { + const displayName = slugToDisplayName(characterSlug); + // Use generic avatar from Convex if available, otherwise SVG fallback + const unknownAvatarSrc = + genericCharacter?.media?.avatarUrl ?? + getAvatarSource({ + slug: characterSlug, + characterName: displayName, + bookSlug: "", + infoPerChapter: [], + }); + + return ( + + +
+ {displayName} +
+

{displayName}

+ { + e.preventDefault(); + e.stopPropagation(); + handleOnClose(); + }} + /> +
+
+ ); + } if (!resolvedMediaSrc) { console.error("no resolved media src for character modal", matchingCharacter); @@ -294,8 +341,7 @@ const CharacterModal: React.FC = ({ - {appearance.percentInChapter}% {t("of_chapter")}{" "} - {getChapterTitle(appearance.chapter, t)} + {appearance.percentInChapter}% {t("of_chapter")} {chapterTitle}
diff --git a/apps/player/src/components/modals/ImageZoomModal.tsx b/apps/player/src/components/modals/ImageZoomModal.tsx new file mode 100644 index 00000000..1907709c --- /dev/null +++ b/apps/player/src/components/modals/ImageZoomModal.tsx @@ -0,0 +1,266 @@ +import React, { useCallback, useEffect, useRef } from "react"; +import { X } from "lucide-react"; + +import { Dialog, DialogContent, DialogTitle } from "@player/components/ui/dialog"; +import { cn } from "@player/lib/utils"; + +interface ImageZoomModalProps { + src: string; + alt?: string; + onClose: () => void; +} + +type PointerPosition = { x: number; y: number }; + +const MIN_SCALE = 1; +const MAX_SCALE = 5; + +const clamp = (value: number, min: number, max: number) => Math.min(max, Math.max(min, value)); + +const getDistance = (a: PointerPosition, b: PointerPosition) => { + const dx = a.x - b.x; + const dy = a.y - b.y; + return Math.hypot(dx, dy); +}; + +const getMidpoint = (a: PointerPosition, b: PointerPosition) => ({ + x: (a.x + b.x) / 2, + y: (a.y + b.y) / 2, +}); + +const ImageZoomModal: React.FC = ({ src, alt, onClose }) => { + const containerRef = useRef(null); + const imgRef = useRef(null); + const baseSizeRef = useRef<{ width: number; height: number } | null>(null); + const pointersRef = useRef>(new Map()); + const lastPanRef = useRef(null); + const pinchRef = useRef<{ + startDistance: number; + startScale: number; + startX: number; + startY: number; + startMid: PointerPosition; + } | null>(null); + const transformRef = useRef({ scale: 1, x: 0, y: 0 }); + const rafRef = useRef(null); + + const applyTransform = useCallback(() => { + if (rafRef.current !== null) return; + rafRef.current = window.requestAnimationFrame(() => { + rafRef.current = null; + const img = imgRef.current; + if (!img) return; + const { scale, x, y } = transformRef.current; + img.style.transform = `translate(${x}px, ${y}px) scale(${scale})`; + }); + }, []); + + const clampTranslate = useCallback(() => { + const container = containerRef.current; + const base = baseSizeRef.current; + if (!container || !base) return; + + const rect = container.getBoundingClientRect(); + const { scale } = transformRef.current; + + if (scale <= 1) { + transformRef.current.x = 0; + transformRef.current.y = 0; + return; + } + + const maxOffsetX = Math.max(0, (base.width * scale - rect.width) / 2); + const maxOffsetY = Math.max(0, (base.height * scale - rect.height) / 2); + + transformRef.current.x = clamp(transformRef.current.x, -maxOffsetX, maxOffsetX); + transformRef.current.y = clamp(transformRef.current.y, -maxOffsetY, maxOffsetY); + }, []); + + const resetTransform = useCallback(() => { + transformRef.current = { scale: 1, x: 0, y: 0 }; + if (imgRef.current) { + imgRef.current.style.transform = "translate(0px, 0px) scale(1)"; + } + }, []); + + const handleImageLoad = useCallback(() => { + resetTransform(); + if (imgRef.current) { + const rect = imgRef.current.getBoundingClientRect(); + baseSizeRef.current = { width: rect.width, height: rect.height }; + } + }, [resetTransform]); + + const handlePointerDown = useCallback((event: React.PointerEvent) => { + if (event.button !== 0) return; + const container = containerRef.current; + if (!container) return; + + container.setPointerCapture(event.pointerId); + pointersRef.current.set(event.pointerId, { x: event.clientX, y: event.clientY }); + + if (pointersRef.current.size === 1) { + lastPanRef.current = { x: event.clientX, y: event.clientY }; + pinchRef.current = null; + } else if (pointersRef.current.size === 2) { + const [p1, 
p2] = Array.from(pointersRef.current.values()); + pinchRef.current = { + startDistance: getDistance(p1, p2), + startScale: transformRef.current.scale, + startX: transformRef.current.x, + startY: transformRef.current.y, + startMid: getMidpoint(p1, p2), + }; + lastPanRef.current = null; + } + }, []); + + const handlePointerMove = useCallback( + (event: React.PointerEvent) => { + if (!pointersRef.current.has(event.pointerId)) return; + + pointersRef.current.set(event.pointerId, { x: event.clientX, y: event.clientY }); + + if (pointersRef.current.size === 1) { + if (transformRef.current.scale <= 1) return; + const last = lastPanRef.current; + if (!last) return; + const dx = event.clientX - last.x; + const dy = event.clientY - last.y; + transformRef.current.x += dx; + transformRef.current.y += dy; + lastPanRef.current = { x: event.clientX, y: event.clientY }; + clampTranslate(); + applyTransform(); + return; + } + + if (pointersRef.current.size >= 2 && pinchRef.current) { + const [p1, p2] = Array.from(pointersRef.current.values()); + const dist = getDistance(p1, p2); + const nextScale = clamp( + pinchRef.current.startScale * (dist / pinchRef.current.startDistance), + MIN_SCALE, + MAX_SCALE, + ); + const mid = getMidpoint(p1, p2); + const dx = mid.x - pinchRef.current.startMid.x; + const dy = mid.y - pinchRef.current.startMid.y; + transformRef.current.scale = nextScale; + transformRef.current.x = pinchRef.current.startX + dx; + transformRef.current.y = pinchRef.current.startY + dy; + clampTranslate(); + applyTransform(); + } + }, + [applyTransform, clampTranslate], + ); + + const handlePointerUp = useCallback( + (event: React.PointerEvent) => { + if (!pointersRef.current.has(event.pointerId)) return; + pointersRef.current.delete(event.pointerId); + if (pointersRef.current.size < 2) { + pinchRef.current = null; + } + if (pointersRef.current.size === 1) { + const [remaining] = Array.from(pointersRef.current.values()); + lastPanRef.current = remaining ?? null; + } else { + lastPanRef.current = null; + } + + if (transformRef.current.scale <= 1) { + transformRef.current = { scale: 1, x: 0, y: 0 }; + applyTransform(); + } + }, + [applyTransform], + ); + + useEffect(() => { + const container = containerRef.current; + if (!container) return; + + const handleWheel = (event: WheelEvent) => { + event.preventDefault(); + const { scale, x, y } = transformRef.current; + const nextScale = clamp(scale * (1 - event.deltaY * 0.0015), MIN_SCALE, MAX_SCALE); + if (nextScale === scale) return; + + const rect = container.getBoundingClientRect(); + const pointer = { + x: event.clientX - rect.left - rect.width / 2, + y: event.clientY - rect.top - rect.height / 2, + }; + const scaleRatio = nextScale / scale; + transformRef.current.scale = nextScale; + transformRef.current.x = x + (1 - scaleRatio) * pointer.x; + transformRef.current.y = y + (1 - scaleRatio) * pointer.y; + + clampTranslate(); + applyTransform(); + }; + + container.addEventListener("wheel", handleWheel, { passive: false }); + return () => { + container.removeEventListener("wheel", handleWheel); + }; + }, [applyTransform, clampTranslate]); + + useEffect(() => { + resetTransform(); + }, [src, resetTransform]); + + return ( + !open && onClose()} modal> + {alt || "Image"} + { + // Prevent dragging the dialog itself + e.preventDefault(); + }} + > + + +
+ {alt +
+
+
+ ); +}; + +export default ImageZoomModal; diff --git a/apps/player/src/components/modals/SearchModal.tsx b/apps/player/src/components/modals/SearchModal.tsx index 5eb75cf1..a2fb499f 100644 --- a/apps/player/src/components/modals/SearchModal.tsx +++ b/apps/player/src/components/modals/SearchModal.tsx @@ -8,7 +8,6 @@ import React, { useRef, } from "react"; import { useTranslation } from "react-i18next"; -import type { TFunction } from "i18next"; import { motion } from "motion/react"; import { Search, FileText, Minimize2, Maximize2, X } from "lucide-react"; @@ -29,10 +28,10 @@ import { AccordionTrigger, AccordionContent, } from "@player/components/ui/accordion"; -import { getChapterTitle } from "@player/utils/getChapterTitle"; import { cn } from "@player/lib/utils"; import { findScrollParent } from "@player/utils/findScrollParent"; import { FILTER_OPTIONS, FILTER_VALUE_MAP, type SearchFilter } from "@player/utils/filterOptions"; +import { useBookConvex } from "@player/context/BookConvexContext"; interface SearchModalProps { onClose: () => void; @@ -51,6 +50,7 @@ export const SearchModal: React.FC = ({ // eslint-disable-next-line complexity -- search UI with filtering, loading states, and result rendering }) => { const { t } = useTranslation(); + const { bookData } = useBookConvex(); const deferredResults = useDeferredValue(searchResults); @@ -368,8 +368,10 @@ export const SearchModal: React.FC = ({ c.id === String(chapter))?.title ?? "" + } items={items} - t={t} clickedAppearanceId={clickedAppearanceId} searchQuery={searchQuery} /> @@ -457,28 +459,27 @@ export const SearchModal: React.FC = ({ const ChapterGroup = memo(function ChapterGroup({ chapter, + chapterTitle, items, - t, clickedAppearanceId, searchQuery, }: { chapter: number; + chapterTitle: string; items: SearchResultItemData[]; - t: TFunction; clickedAppearanceId?: string; searchQuery?: string; }) { - const chapterTitle = useMemo( + const chapterTitleElement = useMemo( () => (
- {getChapterTitle(Number(chapter), t)} ({items.length}{" "} - {items.length === 1 ? "result" : "results"}) + {chapterTitle} ({items.length} {items.length === 1 ? "result" : "results"})
), - [chapter, items.length, t], + [chapterTitle, items.length], ); return ( @@ -487,7 +488,7 @@ const ChapterGroup = memo(function ChapterGroup({ className="border-book-primary-20 rounded-lg mb-3 overflow-hidden" > - {chapterTitle} + {chapterTitleElement}
diff --git a/apps/player/src/context/BookConvexContext.tsx b/apps/player/src/context/BookConvexContext.tsx index 4ad1626f..ea1c24e0 100644 --- a/apps/player/src/context/BookConvexContext.tsx +++ b/apps/player/src/context/BookConvexContext.tsx @@ -362,6 +362,7 @@ export function BookConvexProvider({ bookPath, children }: BookConvexProviderPro const characters = useMemo(() => { if (!charactersQuery) return []; + console.log(`charactersQuery`, charactersQuery); return charactersQuery.map((c) => ({ path: c.path, slug: c.slug, diff --git a/apps/player/src/features/ModalRenderers.tsx b/apps/player/src/features/ModalRenderers.tsx index c7628cb8..0920a86f 100644 --- a/apps/player/src/features/ModalRenderers.tsx +++ b/apps/player/src/features/ModalRenderers.tsx @@ -15,6 +15,7 @@ import { MusicEditModalRenderer } from "./modals/musicEdit/MusicEditModalRendere import { MusicAddModalRenderer } from "./modals/musicAdd/MusicAddModalRenderer"; import { NoteEditModalRenderer } from "./modals/noteEdit/NoteEditModalRenderer"; import { GraphicsSettingsModalRenderer } from "./modals/graphicsSettings/GraphicsSettingsModalRenderer"; +import { ImageZoomModalRenderer } from "./modals/imageZoom/ImageZoomModalRenderer"; export const ModalRenderers: React.FC = () => { return ( @@ -35,6 +36,7 @@ export const ModalRenderers: React.FC = () => { + ); }; diff --git a/apps/player/src/features/modals/imageZoom/ImageZoomModalRenderer.tsx b/apps/player/src/features/modals/imageZoom/ImageZoomModalRenderer.tsx new file mode 100644 index 00000000..d61c7042 --- /dev/null +++ b/apps/player/src/features/modals/imageZoom/ImageZoomModalRenderer.tsx @@ -0,0 +1,20 @@ +import React from "react"; +import { createPortal } from "react-dom"; +import { AnimatePresence } from "motion/react"; + +import { useImageModal } from "@player/stores/modals/imageModal.store"; +import { useEscapeKey } from "@player/hooks/useEscapeKey"; +import ImageZoomModal from "@player/components/modals/ImageZoomModal"; + +export const ImageZoomModalRenderer: React.FC = () => { + const { isOpen, src, alt, closeModal } = useImageModal(); + + useEscapeKey(isOpen, closeModal); + + return createPortal( + + {isOpen && src ? : null} + , + document.body, + ); +}; diff --git a/apps/player/src/helpers/minorCharacterUtils.ts b/apps/player/src/helpers/minorCharacterUtils.ts new file mode 100644 index 00000000..b1e13229 --- /dev/null +++ b/apps/player/src/helpers/minorCharacterUtils.ts @@ -0,0 +1,24 @@ +/** + * Utilities for handling unknown/minor characters that are not in characterMetadata. + * Unknown characters are identified by speakers that have a data-speaker attribute + * but don't match any known character slug in the book's character list. + */ + +/** + * Check if a character slug is for an unknown/minor character + * by checking if it exists in the set of known character slugs. + */ +export function isUnknownCharacter(slug: string, knownSlugs: Set): boolean { + return !knownSlugs.has(slug); +} + +/** + * Convert a slug to a human-readable display name. 
+ * "tall-soldier-at-gate" -> "Tall Soldier At Gate" + */ +export function slugToDisplayName(slug: string): string { + return slug + .split("-") + .map((word) => word.charAt(0).toUpperCase() + word.slice(1)) + .join(" "); +} diff --git a/apps/player/src/hooks/useBookContent.ts b/apps/player/src/hooks/useBookContent.ts index 9a04e11e..5d917901 100644 --- a/apps/player/src/hooks/useBookContent.ts +++ b/apps/player/src/hooks/useBookContent.ts @@ -8,6 +8,7 @@ import { replaceXmlTagsIntoHtmlTags } from "@player/helpers/replaceXmlTagsIntoHt import { activateCharacterInteractions } from "@player/helpers/activateCharacterInteractions"; import { activateFootnoteInteractions } from "@player/helpers/activateFootnoteInteractions"; import { useBookConvex } from "@player/context/BookConvexContext"; +import { useImageModal } from "@player/stores/modals/imageModal.store"; import { markLayoutUnstable, LAYOUT_UNSTABLE_VIRTUALIZER_MS, @@ -48,6 +49,7 @@ export function useBookContent() { const { currentChapter } = location; const { isPlayFormat } = useBookForm(); const { openModal: openCharacterDetailsModal } = useCharacterModal(); + const { openModal: openImageModal } = useImageModal(); // Initialize to -1 so the first real version (0 or 1) is always detected as a change // Using textVersion as initial value would miss the first update if component mounts after version change @@ -77,6 +79,21 @@ export function useBookContent() { const target = event.target as HTMLElement; + const imageTarget = target instanceof HTMLImageElement ? target : target.closest("img"); + if (imageTarget) { + const isInlineAvatar = imageTarget.closest(".inline-avatar"); + const isInChapter = imageTarget.closest("section[data-chapter]"); + if (!isInlineAvatar && isInChapter) { + const src = imageTarget.currentSrc || imageTarget.src; + if (src) { + event.preventDefault(); + event.stopPropagation(); + openImageModal({ src, alt: imageTarget.alt || undefined }); + return; + } + } + } + const isInlineAvatar = target.closest(".inline-avatar"); const isCharacterHighlighted = target.classList.contains("character-highlighted-activated"); const isCharacterPlaceholder = target.closest(".character-placeholder"); @@ -163,7 +180,7 @@ export function useBookContent() { activateFootnoteInteractions(complexitySpan); setSentenceAsClicked(currentSentenceId); }, - [openCharacterDetailsModal, isPlayFormat], + [openCharacterDetailsModal, openImageModal, isPlayFormat], ); useEffect(() => { diff --git a/apps/player/src/locales/en/translation.json b/apps/player/src/locales/en/translation.json index 135a1831..fe724cde 100644 --- a/apps/player/src/locales/en/translation.json +++ b/apps/player/src/locales/en/translation.json @@ -237,7 +237,7 @@ "chapters": "Chapters", "paragraph": "Paragraph", "chapter_percent": "Chapter Percent", - "of_chapter": "of Chapter", + "of_chapter": "of", "book_settings": "Book Settings", "open_chapter": "Open Chapter", "back_to_platform": "Back to Platform", diff --git a/apps/player/src/locales/pl/translation.json b/apps/player/src/locales/pl/translation.json index 2e16f837..b1e3b3a6 100644 --- a/apps/player/src/locales/pl/translation.json +++ b/apps/player/src/locales/pl/translation.json @@ -255,7 +255,7 @@ "chapters": "Rozdziały", "paragraph": "Paragraf", "chapter_percent": "Procent w Rozdziale", - "of_chapter": "rozdziału", + "of_chapter": "w", "book_settings": "Ustawienia Książki", "open_chapter": "Otwórz Rozdział", "back_to_platform": "Powrót do Platformy", diff --git a/apps/player/src/services/__tests__/formatB.test.ts 
b/apps/player/src/services/__tests__/formatB.test.ts index cdf9bf88..2e5152a9 100644 --- a/apps/player/src/services/__tests__/formatB.test.ts +++ b/apps/player/src/services/__tests__/formatB.test.ts @@ -180,6 +180,26 @@ describe("Format B", () => { // Pure em paragraphs should become didaskalia expect(result).toContain('data-is-didaskalia="true"'); }); + + it("marks pure em paragraphs inside format B speaker blocks as didaskalia", () => { + const input = `
+
+

To know my deed, 'twere best not know myself.

+

Knocking within

+

Wake Duncan with thy knocking! I would thou couldst!

+
+
`; + + const result = normalizeChapterHtml(input); + const parser = new DOMParser(); + const doc = parser.parseFromString(result, "text/html"); + + const didaskalia = Array.from(doc.querySelectorAll(".character-text p")).find((p) => + p.textContent?.includes("Knocking within"), + ); + + expect(didaskalia?.getAttribute("data-is-didaskalia")).toBe("true"); + }); }); describe("data-index injection", () => { diff --git a/apps/player/src/services/__tests__/paragraphCount.test.ts b/apps/player/src/services/__tests__/paragraphCount.test.ts new file mode 100644 index 00000000..6d04af74 --- /dev/null +++ b/apps/player/src/services/__tests__/paragraphCount.test.ts @@ -0,0 +1,26 @@ +/** + * @vitest-environment jsdom + */ +import { describe, it, expect } from "vitest"; +import { countParagraphsFromChapterHtml } from "../htmlNormalizer"; + +describe("countParagraphsFromChapterHtml", () => { + it("counts data-index in compiled HTML", () => { + const html = + '

A

B

C

'; + expect(countParagraphsFromChapterHtml(html)).toBe(3); + }); + + it("counts normalized prose children", () => { + const html = '

Title

One

Two

'; + expect(countParagraphsFromChapterHtml(html)).toBe(3); + }); + + it("accounts for play rows in poemProse render mode", () => { + const html = `
+

Hello

+

World

+
`; + expect(countParagraphsFromChapterHtml(html, { renderMode: "poemProse" })).toBe(4); + }); +}); diff --git a/apps/player/src/services/htmlNormalizer.ts b/apps/player/src/services/htmlNormalizer.ts index 890e7a35..e42b0146 100644 --- a/apps/player/src/services/htmlNormalizer.ts +++ b/apps/player/src/services/htmlNormalizer.ts @@ -188,10 +188,14 @@ function transformFormatBToPlayRows(section: Element, doc: Document): void { // Move content paragraphs for (const innerChild of Array.from(child.children)) { + const isExplicitDidaskalia = innerChild.getAttribute("data-is-didaskalia") === "true"; + const isPureEm = + innerChild.tagName.toLowerCase() === "p" ? isPureEmParagraph(innerChild) : false; + const isDidaskalia = isExplicitDidaskalia || isPureEm; const p = innerChild.cloneNode(true) as Element; p.setAttribute("data-text-alignment", state.alignment); p.setAttribute("data-is-character", "false"); - p.setAttribute("data-is-didaskalia", "false"); + p.setAttribute("data-is-didaskalia", isDidaskalia ? "true" : "false"); characterText.appendChild(p); } @@ -349,6 +353,8 @@ export type RenderMode = "default" | "enhancedProse" | "poemProse"; export type EnhancedProseOptions = { speakerDisplayNames?: Map }; +export type ParagraphCountOptions = { renderMode?: RenderMode; bookForm?: string | null }; + function createPlayRowFromSpeakerGroup( paragraphs: Element[], doc: Document, @@ -761,6 +767,47 @@ export function normalizeBookHtml(html: string): string { return doc.body.innerHTML; } +function countDataIndexFromHtml(html: string): number { + const parser = new DOMParser(); + const doc = parser.parseFromString(html, "text/html"); + return doc.querySelectorAll("[data-index]").length; +} + +/** + * Count paragraphs the same way the player indexes them (via data-index). + * This ensures calculateReadProgress stays accurate without loading all chapters client-side. + */ +export function countParagraphsFromChapterHtml( + html: string, + options: ParagraphCountOptions = {}, +): number { + if (!html.trim()) { + return 0; + } + + if (typeof DOMParser === "undefined") { + throw new Error("DOMParser is not available. Provide a DOMParser implementation first."); + } + + let normalizedHtml = html; + if (detectSourceFormat(html) === "source") { + const renderMode = options.renderMode ?? "default"; + const bookForm = options.bookForm?.toLowerCase() ?? 
""; + const useEnhancedProse = renderMode === "enhancedProse" && bookForm !== "play"; + const usePoemProse = renderMode === "poemProse"; + + if (usePoemProse) { + normalizedHtml = normalizeChapterHtmlPoemProse(html); + } else if (useEnhancedProse) { + normalizedHtml = normalizeChapterHtmlEnhanced(html); + } else { + normalizedHtml = normalizeChapterHtml(html); + } + } + + return countDataIndexFromHtml(normalizedHtml); +} + export interface CharacterOccurrence { slug: string; chapter: number; diff --git a/apps/player/src/stores/modals/imageModal.store.ts b/apps/player/src/stores/modals/imageModal.store.ts new file mode 100644 index 00000000..b2d59f8b --- /dev/null +++ b/apps/player/src/stores/modals/imageModal.store.ts @@ -0,0 +1,44 @@ +import { create } from "zustand"; +import { devtools } from "zustand/middleware"; + +import { useModalCoordinator } from "../modalCoordinator.store"; + +export interface ImageModalParams { + src: string; + alt?: string; +} + +const MODAL_ID = "image-modal"; + +interface ImageModalState { + isOpen: boolean; + src?: string; + alt?: string; + + openModal: (params: ImageModalParams) => void; + closeModal: () => void; +} + +export const useImageModal = create()( + devtools( + (set) => ({ + isOpen: false, + src: undefined, + alt: undefined, + + openModal: ({ src, alt }: ImageModalParams) => { + const coordinator = useModalCoordinator.getState(); + if (coordinator.requestModalOpen(MODAL_ID)) { + set({ isOpen: true, src, alt }); + } + }, + + closeModal: () => { + const coordinator = useModalCoordinator.getState(); + coordinator.releaseModal(MODAL_ID); + set({ isOpen: false, src: undefined, alt: undefined }); + }, + }), + { name: "image-modal" }, + ), +); diff --git a/apps/player/src/styles/se-semantics.css b/apps/player/src/styles/se-semantics.css index a7ece189..d037e1c9 100644 --- a/apps/player/src/styles/se-semantics.css +++ b/apps/player/src/styles/se-semantics.css @@ -290,6 +290,7 @@ margin: auto; max-height: 100vh; max-width: 100%; + cursor: zoom-in; } figure.full-page { diff --git a/apps/player/src/ui/activateMediaInRange.ts b/apps/player/src/ui/activateMediaInRange.ts index cb729042..2d3b0312 100644 --- a/apps/player/src/ui/activateMediaInRange.ts +++ b/apps/player/src/ui/activateMediaInRange.ts @@ -314,6 +314,7 @@ function populateInlineAvatarShell( characterData: CharacterData | undefined, location: { chapter: number; paragraph: number } | null, snapshotOverride?: CharacterSnapshot | null, + genericCharacter?: CharacterData, ): boolean { if (shell.querySelector("img")) { return false; @@ -325,9 +326,38 @@ function populateInlineAvatarShell( return false; } + // Handle unknown characters (not in Convex) if (!characterData) { - console.warn(`[populateInlineAvatarShell] ${characterSlug}: no characterData provided`); - return false; + console.log("generic", genericCharacter); + // Try generic avatar first, then SVG fallback + const genericAvatarUrl = genericCharacter?.media?.avatarUrl; + console.log(`[populateInlineAvatarShell] Unknown character "${characterSlug}":`, { + hasGenericCharacter: !!genericCharacter, + genericCharacterSlug: genericCharacter?.slug, + genericAvatarUrl, + genericMedia: genericCharacter?.media, + }); + const fallbackSrc = genericAvatarUrl ?? 
generateFallbackAvatarUrl(characterSlug); + + // Generate display name from slug: "other-board-members" -> "Other Board Members" + const displayName = characterSlug.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase()); + + shell.title = displayName; + + const placeholderImg = document.createElement("img"); + placeholderImg.src = normalizeSrcForInlineAvatar(fallbackSrc); + placeholderImg.classList.add( + "absolute", + "top-0", + "left-0", + "w-full", + "h-full", + "object-cover", + "rounded-full", + ); + placeholderImg.alt = displayName; + shell.appendChild(placeholderImg); + return true; } const snapshot = @@ -462,6 +492,7 @@ export function activateMediaInRange( ) { const charactersData = getCharactersData(); const charactersBySlug = new Map(charactersData.map((c) => [c.slug, c])); + const genericCharacter = charactersBySlug.get("generic-avatar"); if (isPlayFormat && !isMobile()) { const activeParagraph = document.querySelector(`.active-paragraph`); @@ -570,6 +601,7 @@ export function activateMediaInRange( characterData, locationForPlaceholder, snapshot, + genericCharacter, ); } } else { @@ -680,7 +712,9 @@ export const openPlayRowCharacterModal = ( }; export function hydrateInlineAvatarsInSection(section: HTMLElement): void { + console.log("getCharactersData", getCharactersData()); const charactersBySlug = new Map(getCharactersData().map((c) => [c.slug, c])); + const genericCharacter = charactersBySlug.get("generic-avatar"); const chapterAttr = section.dataset.chapter; const chapterNumber = chapterAttr ? parseInt(chapterAttr, 10) : 0; @@ -694,7 +728,7 @@ export function hydrateInlineAvatarsInSection(section: HTMLElement): void { const paragraphIndex = paragraphEl?.dataset.index ? parseInt(paragraphEl.dataset.index, 10) : 0; const location = { chapter: chapterNumber, paragraph: paragraphIndex }; - populateInlineAvatarShell(shell, characterData, location); + populateInlineAvatarShell(shell, characterData, location, undefined, genericCharacter); }); const personaCells = section.querySelectorAll( diff --git a/apps/player/src/utils/getChapterTitle.ts b/apps/player/src/utils/getChapterTitle.ts deleted file mode 100644 index f5bcfce9..00000000 --- a/apps/player/src/utils/getChapterTitle.ts +++ /dev/null @@ -1,107 +0,0 @@ -import { getBookData } from "@player/state/bookDataStore"; -import { isNumberTitle } from "./isNumberTitle"; - -export const getTitle = (chapter: number, t: (key: string) => string) => { - // Special case for 0 - if (chapter === 0) return t("chapter_zero"); - - // Units (1-9) - const units = [ - "", - t("ordinal.1"), - t("ordinal.2"), - t("ordinal.3"), - t("ordinal.4"), - t("ordinal.5"), - t("ordinal.6"), - t("ordinal.7"), - t("ordinal.8"), - t("ordinal.9"), - ]; - - // Teens (11-19) - const teens = [ - t("ordinal.10"), - t("ordinal.11"), - t("ordinal.12"), - t("ordinal.13"), - t("ordinal.14"), - t("ordinal.15"), - t("ordinal.16"), - t("ordinal.17"), - t("ordinal.18"), - t("ordinal.19"), - ]; - - // Tens (10, 20, 30, etc.) - const tens = [ - "", - t("ordinal.10"), - t("ordinal.20"), - t("ordinal.30"), - t("ordinal.40"), - t("ordinal.50"), - t("ordinal.60"), - t("ordinal.70"), - t("ordinal.80"), - t("ordinal.90"), - ]; - - // Hundreds (100, 200, etc.) 
- in case they're needed for very large books - const hundreds = [ - "", - t("ordinal.100"), - t("ordinal.200"), - t("ordinal.300"), - t("ordinal.400"), - t("ordinal.500"), - t("ordinal.600"), - t("ordinal.700"), - t("ordinal.800"), - t("ordinal.900"), - ]; - - let chapterName = ""; - - if (chapter >= 100) { - const hundred = Math.floor(chapter / 100); - chapterName += hundreds[hundred] + " "; - chapter %= 100; - } - - if (chapter >= 20) { - const ten = Math.floor(chapter / 10); - const unit = chapter % 10; - chapterName += tens[ten]; - if (unit > 0) { - chapterName += " " + units[unit]; - } - } else if (chapter >= 10) { - chapterName += teens[chapter - 10]; - } else { - chapterName += units[chapter]; - } - - return `${t("chapter")} ${chapterName.trim()}`; -}; - -/** - * Get the display title for a chapter, preferring custom titles over ordinal numbers - * @param chapterNumber - The chapter number (1-based) - * @param t - Translation function - * @returns The chapter title to display - */ -export const getChapterTitle = (chapterNumber: number, t: (key: string) => string): string => { - const bookData = getBookData(); - - if (!bookData?.chapters) { - return `${t("chapter")} ${chapterNumber}`; - } - - const chapter = bookData.chapters.find((ch) => parseInt(ch.id) === chapterNumber); - if (chapter && chapter.title.trim() && !isNumberTitle(chapter.title)) { - return chapter.title; - } - - return getTitle(chapterNumber, t); -}; diff --git a/bun.lock b/bun.lock index 7b8424b2..d9e19503 100644 --- a/bun.lock +++ b/bun.lock @@ -152,12 +152,14 @@ "name": "@bookgenius/pipeline", "version": "1.0.0", "dependencies": { - "@ai-sdk/anthropic": "2.0.38", - "@ai-sdk/cerebras": "^1.0.20", - "@ai-sdk/google": "^2.0.14", - "@ai-sdk/groq": "^2.0.21", - "@ai-sdk/openai": "^2.0.30", - "@ai-sdk/provider": "^2.0.0", + "@ai-sdk/anthropic": "^3.0.36", + "@ai-sdk/azure": "^3.0.26", + "@ai-sdk/cerebras": "^2.0.30", + "@ai-sdk/google": "^3.0.21", + "@ai-sdk/groq": "^3.0.21", + "@ai-sdk/openai": "^3.0.25", + "@ai-sdk/provider": "^3.0.7", + "@ai-sdk/react": "^3.0.73", "@anthropic-ai/claude-agent-sdk": "^0.1.59", "@anthropic-ai/sdk": "^0.39.0", "@arcjet/bun": "^1.0.0-beta.15", @@ -170,7 +172,7 @@ "@google-cloud/vertexai": "^1.9.3", "@google/genai": "^0.9.0", "@google/generative-ai": "^0.24.0", - "@openrouter/ai-sdk-provider": "^1.2.0", + "@openrouter/ai-sdk-provider": "^2.1.1", "@runwayml/sdk": "^2.0.2", "@sentry/node": "^10.11.0", "@trpc/server": "^11.0.0-rc.660", @@ -180,7 +182,7 @@ "@types/multer": "^2.0.0", "@types/ws": "^8.18.0", "@xmldom/xmldom": "^0.9.8", - "ai": "5.0.81", + "ai": "^6.0.71", "async_hooks": "^1.0.0", "axios": "^1.8.1", "chalk": "^5.4.1", @@ -353,6 +355,7 @@ "@sentry/react": "^10.18.0", "@supabase/supabase-js": "^2.55.0", "@tanstack/react-query": "^5.83.0", + "@trpc/client": "^11.9.0", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "cmdk": "^1.1.1", @@ -623,23 +626,27 @@ "@adobe/css-tools": ["@adobe/css-tools@4.4.4", "", {}, "sha512-Elp+iwUx5rN5+Y8xLt5/GRoG20WGoDCQ/1Fb+1LiGtvwbDavuSk0jhD/eZdckHAuzcDzccnkv+rEjyWfRx18gg=="], - "@ai-sdk/anthropic": ["@ai-sdk/anthropic@2.0.38", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.13" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-NjU1ftHbu90OfRCgBwfFelmdEXwGFwLEcfyOyyfjRDm8QHaJUlPNnXhdhPTYuUU386yhj29Vibemiaq6jQv3lA=="], + "@ai-sdk/anthropic": ["@ai-sdk/anthropic@3.0.36", "", { "dependencies": { "@ai-sdk/provider": "3.0.7", "@ai-sdk/provider-utils": "4.0.13" }, "peerDependencies": { "zod": 
"^3.25.76 || ^4.1.8" } }, "sha512-GHQccfwC0j1JltN9M47RSlBpOyHoUam0mvbYMf8zpE0UD1tzIX5sDw2m/8nRlrTz6wGuKfaDxmoC3XH7uhTrXg=="], - "@ai-sdk/cerebras": ["@ai-sdk/cerebras@1.0.34", "", { "dependencies": { "@ai-sdk/openai-compatible": "1.0.30", "@ai-sdk/provider": "2.0.1", "@ai-sdk/provider-utils": "3.0.20" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-XOK0dJsAGoPYi/lfR4KFBi8xhvJ46oCpAxUD6FmJAuJ4eh0qlj5zDt+myvzM8gvN7S6K7zHD+mdWlOPKGQT8Vg=="], + "@ai-sdk/azure": ["@ai-sdk/azure@3.0.26", "", { "dependencies": { "@ai-sdk/openai": "3.0.25", "@ai-sdk/provider": "3.0.7", "@ai-sdk/provider-utils": "4.0.13" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-R9v2kuzeu80qErIcXyqM1dpr6/w/iEPdxNAgFPJOFUhk5Zi4XSEBGz+eLYdw1cuzVPfqgMWyFvze9LR7vqz0Gg=="], - "@ai-sdk/gateway": ["@ai-sdk/gateway@2.0.2", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.13", "@vercel/oidc": "3.0.3" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-25F1qPqZxOw9IcV9OQCL29hV4HAFLw5bFWlzQLBi5aDhEZsTMT2rMi3umSqNaUxrrw+dLRtjOL7RbHC+WjbA/A=="], + "@ai-sdk/cerebras": ["@ai-sdk/cerebras@2.0.30", "", { "dependencies": { "@ai-sdk/openai-compatible": "2.0.27", "@ai-sdk/provider": "3.0.7", "@ai-sdk/provider-utils": "4.0.13" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-z6f07Ch95TZCBi6lFGrrXimInXWiN/OTfo6VmIo/lWNcK9qJP4UUPg2KOSk/H6r0xF3o+IJg1bIYKL2eEJdIHA=="], - "@ai-sdk/google": ["@ai-sdk/google@2.0.52", "", { "dependencies": { "@ai-sdk/provider": "2.0.1", "@ai-sdk/provider-utils": "3.0.20" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-2XUnGi3f7TV4ujoAhA+Fg3idUoG/+Y2xjCRg70a1/m0DH1KSQqYaCboJ1C19y6ZHGdf5KNT20eJdswP6TvrY2g=="], + "@ai-sdk/gateway": ["@ai-sdk/gateway@3.0.34", "", { "dependencies": { "@ai-sdk/provider": "3.0.7", "@ai-sdk/provider-utils": "4.0.13", "@vercel/oidc": "3.1.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-ZS1dWai5DINQOv3bpNi3ua9+yt3jnRaux3CwEr/ai9qETp6T+2wcpZyw+0jUHo1He/Lznw//AQh4zhdwHnTWrg=="], - "@ai-sdk/groq": ["@ai-sdk/groq@2.0.34", "", { "dependencies": { "@ai-sdk/provider": "2.0.1", "@ai-sdk/provider-utils": "3.0.20" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-wfCYkVgmVjxNA32T57KbLabVnv9aFUflJ4urJ7eWgTwbnmGQHElCTu+rJ3ydxkXSqxOkXPwMOttDm7XNrvPjmg=="], + "@ai-sdk/google": ["@ai-sdk/google@3.0.21", "", { "dependencies": { "@ai-sdk/provider": "3.0.7", "@ai-sdk/provider-utils": "4.0.13" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-qQuvcbDqDPZojtoT45UFCQVH2w3m6KJKKjqJduUsvhN5ZqOXste0h4HgHK8hwGuDfv96Jr9QQEpspbgp6iu5Uw=="], - "@ai-sdk/openai": ["@ai-sdk/openai@2.0.89", "", { "dependencies": { "@ai-sdk/provider": "2.0.1", "@ai-sdk/provider-utils": "3.0.20" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-4+qWkBCbL9HPKbgrUO/F2uXZ8GqrYxHa8SWEYIzxEJ9zvWw3ISr3t1/27O1i8MGSym+PzEyHBT48EV4LAwWaEw=="], + "@ai-sdk/groq": ["@ai-sdk/groq@3.0.21", "", { "dependencies": { "@ai-sdk/provider": "3.0.7", "@ai-sdk/provider-utils": "4.0.13" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-sYTnGbvUNoDKTUa5BBKDvzSnjgahtEWriohdljtGBd4vgcimqY3XMXAqefOXEiYjON/GBFx6Q/YR3GVcX32Mcg=="], - "@ai-sdk/openai-compatible": ["@ai-sdk/openai-compatible@1.0.30", "", { "dependencies": { "@ai-sdk/provider": "2.0.1", "@ai-sdk/provider-utils": "3.0.20" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-thubwhRtv9uicAxSWwNpinM7hiL/0CkhL/ymPaHuKvI494J7HIzn8KQZQ2ymRz284WTIZnI7VMyyejxW4RMM6w=="], + "@ai-sdk/openai": 
["@ai-sdk/openai@3.0.25", "", { "dependencies": { "@ai-sdk/provider": "3.0.7", "@ai-sdk/provider-utils": "4.0.13" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-DsaN46R98+D1W3lU3fKuPU3ofacboLaHlkAwxJPgJ8eup1AJHmPK1N1y10eJJbJcF6iby8Tf/vanoZxc9JPUfw=="], - "@ai-sdk/provider": ["@ai-sdk/provider@2.0.1", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-KCUwswvsC5VsW2PWFqF8eJgSCu5Ysj7m1TxiHTVA6g7k360bk0RNQENT8KTMAYEs+8fWPD3Uu4dEmzGHc+jGng=="], + "@ai-sdk/openai-compatible": ["@ai-sdk/openai-compatible@2.0.27", "", { "dependencies": { "@ai-sdk/provider": "3.0.7", "@ai-sdk/provider-utils": "4.0.13" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-YpAZe7OQuMkYqcM/m1BMX0xFn4QdhuL4qGo8sNaiLq1VjEeU/pPfz51rnlpCfCvYanUL5TjIZEbdclBUwLooSQ=="], - "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.13", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.5" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-aXFLBLRPTUYA853MJliItefSXeJPl+mg0KSjbToP41kJ+banBmHO8ZPGLJhNqGlCU82o11TYN7G05EREKX8CkA=="], + "@ai-sdk/provider": ["@ai-sdk/provider@3.0.7", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-VkPLrutM6VdA924/mG8OS+5frbVTcu6e046D2bgDo00tehBANR1QBJ/mPcZ9tXMFOsVcm6SQArOregxePzTFPw=="], + + "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@4.0.13", "", { "dependencies": { "@ai-sdk/provider": "3.0.7", "@standard-schema/spec": "^1.1.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-HHG72BN4d+OWTcq2NwTxOm/2qvk1duYsnhCDtsbYwn/h/4zeqURu1S0+Cn0nY2Ysq9a9HGKvrYuMn9bgFhR2Og=="], + + "@ai-sdk/react": ["@ai-sdk/react@3.0.73", "", { "dependencies": { "@ai-sdk/provider-utils": "4.0.13", "ai": "6.0.71", "swr": "^2.2.5", "throttleit": "2.1.0" }, "peerDependencies": { "react": "^18 || ~19.0.1 || ~19.1.2 || ^19.2.1" } }, "sha512-k7KnGZuTn5HKZWUh2OLxij5Erhedo/6faiO3eb+pD3fIVXwt8sr4V1tU60pBG349Kzrj2EUzg5mn1TYZf1Y2pw=="], "@alcalzone/ansi-tokenize": ["@alcalzone/ansi-tokenize@0.1.3", "", { "dependencies": { "ansi-styles": "^6.2.1", "is-fullwidth-code-point": "^4.0.0" } }, "sha512-3yWxPTq3UQ/FY9p1ErPxIyfT64elWaMvM9lIHnaqpyft63tkxodF5aUElYHrdisWve5cETkh1+KBw1yJuW0aRw=="], @@ -1559,9 +1566,7 @@ "@openai/agents-realtime": ["@openai/agents-realtime@0.1.11", "", { "dependencies": { "@openai/agents-core": "0.1.11", "@types/ws": "^8.18.1", "debug": "^4.4.0", "ws": "^8.18.1" }, "peerDependencies": { "zod": "^3.25.40" } }, "sha512-8jaNuYU1acra28i7bYrZIPubI6s2ziY2ZudqAVK2ad+giopXcrNSiJTuZ2S3z+ESnIejwMiYLfnY2Le8W0SJ7A=="], - "@openrouter/ai-sdk-provider": ["@openrouter/ai-sdk-provider@1.5.4", "", { "dependencies": { "@openrouter/sdk": "^0.1.27" }, "peerDependencies": { "ai": "^5.0.0", "zod": "^3.24.1 || ^v4" } }, "sha512-xrSQPUIH8n9zuyYZR0XK7Ba0h2KsjJcMkxnwaYfmv13pKs3sDkjPzVPPhlhzqBGddHb5cFEwJ9VFuFeDcxCDSw=="], - - "@openrouter/sdk": ["@openrouter/sdk@0.1.27", "", { "dependencies": { "zod": "^3.25.0 || ^4.0.0" } }, "sha512-RH//L10bSmc81q25zAZudiI4kNkLgxF2E+WU42vghp3N6TEvZ6F0jK7uT3tOxkEn91gzmMw9YVmDENy7SJsajQ=="], + "@openrouter/ai-sdk-provider": ["@openrouter/ai-sdk-provider@2.1.1", "", { "peerDependencies": { "ai": "^6.0.0", "zod": "^3.25.0 || ^4.0.0" } }, "sha512-UypPbVnSExxmG/4Zg0usRiit3auvQVrjUXSyEhm0sZ9GQnW/d8p/bKgCk2neh1W5YyRSo7PNQvCrAEBHZnqQkQ=="], "@opentelemetry/api": ["@opentelemetry/api@1.9.0", "", {}, "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg=="], @@ -2411,7 +2416,7 @@ 
"@urql/exchange-retry": ["@urql/exchange-retry@1.3.2", "", { "dependencies": { "@urql/core": "^5.1.2", "wonka": "^6.3.2" } }, "sha512-TQMCz2pFJMfpNxmSfX1VSfTjwUIFx/mL+p1bnfM1xjjdla7Z+KnGMW/EhFbpckp3LyWAH4PgOsMwOMnIN+MBFg=="], - "@vercel/oidc": ["@vercel/oidc@3.0.3", "", {}, "sha512-yNEQvPcVrK9sIe637+I0jD6leluPxzwJKx/Haw6F4H77CdDsszUn5V3o96LPziXkSNE2B83+Z3mjqGKBK/R6Gg=="], + "@vercel/oidc": ["@vercel/oidc@3.1.0", "", {}, "sha512-Fw28YZpRnA3cAHHDlkt7xQHiJ0fcL+NRcIqsocZQUSmbzeIKRpwttJjik5ZGanXP+vlA4SbTg+AbA3bP363l+w=="], "@vitejs/plugin-react": ["@vitejs/plugin-react@4.7.0", "", { "dependencies": { "@babel/core": "^7.28.0", "@babel/plugin-transform-react-jsx-self": "^7.27.1", "@babel/plugin-transform-react-jsx-source": "^7.27.1", "@rolldown/pluginutils": "1.0.0-beta.27", "@types/babel__core": "^7.20.5", "react-refresh": "^0.17.0" }, "peerDependencies": { "vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0" } }, "sha512-gUu9hwfWvvEDBBmgtAowQCojwZmJ5mcLn3aufeCsitijs3+f2NsrPtlAWIR6OPiqljl96GVCUbLe0HyqIpVaoA=="], @@ -2465,7 +2470,7 @@ "aggregate-error": ["aggregate-error@3.1.0", "", { "dependencies": { "clean-stack": "^2.0.0", "indent-string": "^4.0.0" } }, "sha512-4I7Td01quW/RpocfNayFdFVk1qSuoh0E7JrbRJ16nH01HhKFQ88INq9Sd+nd72zqRySlr9BmDA8xlEJ6vJMrYA=="], - "ai": ["ai@5.0.81", "", { "dependencies": { "@ai-sdk/gateway": "2.0.2", "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.13", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-SB7oMC9QSpIu1VLswFTZuhhpfQfrGtFBUbWLtHBkhjWZIQskjtcdEhB+N4yO9hscdc2wYtjw/tacgoxX93QWFw=="], + "ai": ["ai@6.0.71", "", { "dependencies": { "@ai-sdk/gateway": "3.0.34", "@ai-sdk/provider": "3.0.7", "@ai-sdk/provider-utils": "4.0.13", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-33/Qq0BhEG+SabYeE7ZlLL3DCubUQo04C8E9UdEHqh2DJuGdtxMXG/TSMkO2uF+ZmlGJpD4UY5Wij9/qal298w=="], "ajv": ["ajv@6.12.6", "", { "dependencies": { "fast-deep-equal": "^3.1.1", "fast-json-stable-stringify": "^2.0.0", "json-schema-traverse": "^0.4.1", "uri-js": "^4.2.2" } }, "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g=="], @@ -4851,6 +4856,8 @@ "throat": ["throat@5.0.0", "", {}, "sha512-fcwX4mndzpLQKBS1DVYhGAcYaYt7vsHNIvQV+WXMvnow5cgjPphq5CaayLaGsjRdSCKZFNGt7/GYAuXaNOiYCA=="], + "throttleit": ["throttleit@2.1.0", "", {}, "sha512-nt6AMGKW1p/70DF/hGBdJB57B8Tspmbp5gfJ8ilhLnt7kkr2ye7hzD6NVG8GGErk2HWF34igrL2CXmNIkzKqKw=="], + "through2": ["through2@2.0.5", "", { "dependencies": { "readable-stream": "~2.3.6", "xtend": "~4.0.1" } }, "sha512-/mrRod8xqpA+IHSLyGCQ2s8SPHiCDEeQJSep1jqLYeEUClOFG2Qsh+4FU6G9VeqpZnGW/Su8LQGc4YKni5rYSQ=="], "tiny-invariant": ["tiny-invariant@1.3.3", "", {}, "sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg=="], @@ -5223,22 +5230,6 @@ "zwitch": ["zwitch@2.0.4", "", {}, "sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A=="], - "@ai-sdk/anthropic/@ai-sdk/provider": ["@ai-sdk/provider@2.0.0", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="], - - "@ai-sdk/cerebras/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.20", "", { "dependencies": { "@ai-sdk/provider": "2.0.1", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, 
"sha512-iXHVe0apM2zUEzauqJwqmpC37A5rihrStAih5Ks+JE32iTe4LZ58y17UGBjpQQTCRw9YxMeo2UFLxLpBluyvLQ=="], - - "@ai-sdk/gateway/@ai-sdk/provider": ["@ai-sdk/provider@2.0.0", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="], - - "@ai-sdk/google/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.20", "", { "dependencies": { "@ai-sdk/provider": "2.0.1", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-iXHVe0apM2zUEzauqJwqmpC37A5rihrStAih5Ks+JE32iTe4LZ58y17UGBjpQQTCRw9YxMeo2UFLxLpBluyvLQ=="], - - "@ai-sdk/groq/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.20", "", { "dependencies": { "@ai-sdk/provider": "2.0.1", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-iXHVe0apM2zUEzauqJwqmpC37A5rihrStAih5Ks+JE32iTe4LZ58y17UGBjpQQTCRw9YxMeo2UFLxLpBluyvLQ=="], - - "@ai-sdk/openai/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.20", "", { "dependencies": { "@ai-sdk/provider": "2.0.1", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-iXHVe0apM2zUEzauqJwqmpC37A5rihrStAih5Ks+JE32iTe4LZ58y17UGBjpQQTCRw9YxMeo2UFLxLpBluyvLQ=="], - - "@ai-sdk/openai-compatible/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.20", "", { "dependencies": { "@ai-sdk/provider": "2.0.1", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-iXHVe0apM2zUEzauqJwqmpC37A5rihrStAih5Ks+JE32iTe4LZ58y17UGBjpQQTCRw9YxMeo2UFLxLpBluyvLQ=="], - - "@ai-sdk/provider-utils/@ai-sdk/provider": ["@ai-sdk/provider@2.0.0", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="], - "@alcalzone/ansi-tokenize/ansi-styles": ["ansi-styles@6.2.3", "", {}, "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg=="], "@anthropic-ai/sdk/@types/node": ["@types/node@18.19.130", "", { "dependencies": { "undici-types": "~5.26.4" } }, "sha512-GRaXQx6jGfL8sKfaIDD6OupbIHBr9jv7Jnaml9tB7l4v068PAOXqfcujMMo5PhbIs6ggR1XODELqahT2R8v0fg=="], @@ -5677,8 +5668,6 @@ "aggregate-error/indent-string": ["indent-string@4.0.0", "", {}, "sha512-EdDDZu4A2OyIK7Lr/2zG+w5jmbuk1DVBnEwREQvBzspBJkCEbRa8GxU1lghYcaGJCnRWibjDXlq779X1/y5xwg=="], - "ai/@ai-sdk/provider": ["@ai-sdk/provider@2.0.0", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="], - "ajv-formats/ajv": ["ajv@8.17.1", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", "json-schema-traverse": "^1.0.0", "require-from-string": "^2.0.2" } }, "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g=="], "anymatch/picomatch": ["picomatch@2.3.1", "", {}, "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA=="], @@ -6099,6 +6088,8 @@ "pkg-up/find-up": ["find-up@3.0.0", "", { "dependencies": { "locate-path": "^3.0.0" } }, "sha512-1yD6RmLI1XBfxugvORwlck6f75tYL+iR0jqwsOrOxMZyGYqUuDhJ0l4AXdO1iX/FTs9cBAMEk1gWSEx1kSbylg=="], + "platform/@trpc/client": ["@trpc/client@11.9.0", "", { "peerDependencies": { "@trpc/server": "11.9.0", "typescript": ">=5.7.2" } }, 
"sha512-3r4RT/GbR263QO+2gCPyrs5fEYaXua3/AzCs+GbWC09X0F+mVkyBpO3GRSDObiNU/N1YB597U7WGW3WA1d1TVw=="], + "platform/date-fns": ["date-fns@3.6.0", "", {}, "sha512-fRHTG8g/Gif+kSh50gaGEdToemgfj74aRX3swtiouboip5JDLAyDE9F11nHMIcvOaXeOC6D7SpNhi7uFyB7Uww=="], "platform/eslint-plugin-react-hooks": ["eslint-plugin-react-hooks@5.2.0", "", { "peerDependencies": { "eslint": "^3.0.0 || ^4.0.0 || ^5.0.0 || ^6.0.0 || ^7.0.0 || ^8.0.0-0 || ^9.0.0" } }, "sha512-+f15FfK64YQwZdJNELETdn5ibXEUQmW1DZL6KXhNnc2heoy/sg9VJJeT7n8TlMWouzWqSWavFkIhHyIbIAEapg=="], diff --git a/convex/characterPromptGeneration.ts b/convex/characterPromptGeneration.ts index 7a81b31c..290b929a 100644 --- a/convex/characterPromptGeneration.ts +++ b/convex/characterPromptGeneration.ts @@ -97,7 +97,7 @@ export const generateCharacterPrompt = internalAction({ } const parser = new DOMParser(); - const doc = parser.parseFromString(xmlResult.content, "text/xml"); + const doc = parser.parseFromString(xmlResult.content, "text/html"); const section = doc.getElementsByTagName("section")[0] as XmlDomElement; if (!section) { diff --git a/convex/generator.ts b/convex/generator.ts index b760a3c6..e8f8d37e 100644 --- a/convex/generator.ts +++ b/convex/generator.ts @@ -5,6 +5,7 @@ import { adminMutation, publicQuery } from "./functions"; const PIPELINE_STEPS = [ "import_epub", "create_settings", + "upload_figures", "generate_reference_cards", "rewrite_paragraphs", "generate_graphical_style", @@ -88,6 +89,7 @@ export const ensureBookStructure = adminMutation({ await createFolder(`${bookPath}/characters-data`); await createFolder(`${bookPath}/backgrounds`); await createFolder(`${bookPath}/music`); + await createFolder(`${bookPath}/figures`); const existingBook = await ctx.db .query("books") diff --git a/convex/lib/characterDataV2.ts b/convex/lib/characterDataV2.ts index d1a67fea..64f41778 100644 --- a/convex/lib/characterDataV2.ts +++ b/convex/lib/characterDataV2.ts @@ -342,6 +342,16 @@ export function mergeV2ToCharacterData( avatar: o.avatar, })), }); + } else { + if (slug === "generic-avatar") { + result.push({ + slug, + characterName: meta.name, + bookSlug, + infoPerChapter: [], + media: meta.media, + }); + } } } diff --git a/convex/paragraphEditor.ts b/convex/paragraphEditor.ts index 74a28bf6..26e75070 100644 --- a/convex/paragraphEditor.ts +++ b/convex/paragraphEditor.ts @@ -241,7 +241,7 @@ export const setParagraphSpeaker = bookAction({ console.log("[setParagraphSpeaker] HTML length:", htmlResult.content.length); const parser = new DOMParser(); - const doc = parser.parseFromString(htmlResult.content, "text/xml"); + const doc = parser.parseFromString(htmlResult.content, "text/html"); const paragraph = findParagraphByIndex(doc, paragraphIndex); console.log( @@ -345,7 +345,7 @@ export const modifyCharacterTag = bookAction({ } const parser = new DOMParser(); - const doc = parser.parseFromString(htmlResult.content, "text/xml"); + const doc = parser.parseFromString(htmlResult.content, "text/html"); const paragraph = findParagraphByIndex(doc, paragraphIndex); if (!paragraph) { @@ -451,7 +451,7 @@ export const wrapTextWithCharacter = bookAction({ } const parser = new DOMParser(); - const doc = parser.parseFromString(htmlResult.content, "text/xml"); + const doc = parser.parseFromString(htmlResult.content, "text/html"); const paragraph = findParagraphByIndex(doc, paragraphIndex); if (!paragraph) { @@ -547,7 +547,7 @@ export const removeNoteFromChapter = bookAction({ } const parser = new DOMParser(); - const doc = parser.parseFromString(htmlResult.content, 
"text/xml"); + const doc = parser.parseFromString(htmlResult.content, "text/html"); let noteFound = false;