'Of course they were', said the Dormouse; '—well in.'
+
+ 'Of course they were', said the Dormouse; '—well in.'
+
```
## Example 3: Multiple Speakers (Edge Case)
@@ -121,6 +146,30 @@ Identify mentions of the characters within the text.
```
+## Example 4: Unknown Character Speaker
+
+**Characters (JSON):**
+
+```json
+[{ "id": "winston", "name": "Winston", "desc": "Protagonist" }]
+```
+
+**Input HTML:**
+
+```html
+<p>"Stand back!" shouted the tall soldier at the gate.</p>
+<p>Winston obeyed silently.</p>
+```
+
+**Output HTML:**
+
+```html
+<p data-speaker="tall-soldier-at-gate">"Stand back!" shouted the tall soldier at the gate.</p>
+<p><span data-c="winston">Winston</span> obeyed silently.</p>
+```
+
+Note: The soldier gets `data-speaker` with a descriptive slug, but is NOT wrapped in `data-c` because they're not in the Characters List.
+
---
## Important reminder
diff --git a/apps/pipeline/src/tools/NewRewriteParagraphsPromptBookChunked.md b/apps/pipeline/src/tools/NewRewriteParagraphsPromptBookChunked.md
index bea6bbe8..d995eb3e 100644
--- a/apps/pipeline/src/tools/NewRewriteParagraphsPromptBookChunked.md
+++ b/apps/pipeline/src/tools/NewRewriteParagraphsPromptBookChunked.md
@@ -34,6 +34,24 @@ Identify mentions of the characters within the text.
- **Flexibility:** Match names even if they appear in different grammatical cases (e.g., Polish declensions like "Winstona", "Winstonowi") or possessives (English "Winston's") or when referenced by title ("General") - but only if it's a clear reference to the character.
- **Structure:** `Mentioned Name`
+## 3. Unknown Character Speakers
+
+When dialogue is spoken by a character **NOT in the Characters List**:
+
+- **Tag their SPEECH ONLY** - add `data-speaker` attribute to the paragraph
+- **DO NOT tag their mentions** - no `data-c` spans for unknown characters
+- **Generate a descriptive slug** based on how the text refers to them or their observable traits
+
+### Slug Guidelines for Unknown Characters:
+
+- Keep descriptions concise but uniquely identifying (2-5 words)
+- Use observable traits: role, appearance, location, action
+- Be specific enough to differentiate similar characters (e.g., two soldiers → `tall-soldier-at-gate` vs `wounded-soldier`)
+
+**Good Examples:** `tall-soldier-at-gate`, `old-woman-selling-bread`, `gruff-innkeeper`, `the-nurse`
+
+**Bad Examples:** `person` (too generic), `speaker` (not descriptive), `character-1` (meaningless), `soldier` (too generic)
+
# Constraints (CRITICAL)
1. **Text Invariance:** The visible text inside the tags must remain **EXACTLY** the same as the input. Do not fix grammar, do not correct spelling, do not remove archaic words.
@@ -63,8 +81,13 @@ Identify mentions of the characters within the text.
**Output HTML:**
```html
-
Książę spojrzał na Sarę, a jego wzrok złagodniał.
-
— Panie mój — wyszeptała Sara — twe słowa są jak światło.
+
+ Książę spojrzał na Sarę, a jego
+ wzrok złagodniał.
+
+
+ — Panie mój — wyszeptała Sara — twe słowa są jak światło.
+
```
## Example 2: English (Quotes & Formatting)
@@ -92,7 +115,9 @@ Identify mentions of the characters within the text.
'But they were in the well,' Alice said to the
Dormouse, ignoring the remark.
-
'Of course they were', said the Dormouse; '—well in.'
+
+ 'Of course they were', said the Dormouse; '—well in.'
+
```
## Example 3: Multiple Speakers (Edge Case)
@@ -121,6 +146,30 @@ Identify mentions of the characters within the text.
```
+## Example 4: Unknown Character Speaker
+
+**Characters (JSON):**
+
+```json
+[{ "id": "winston", "name": "Winston", "desc": "Protagonist" }]
+```
+
+**Input HTML:**
+
+```html
+<p>"Stand back!" shouted the tall soldier at the gate.</p>
+<p>Winston obeyed silently.</p>
+```
+
+**Output HTML:**
+
+```html
+<p data-speaker="tall-soldier-at-gate">"Stand back!" shouted the tall soldier at the gate.</p>
+<p><span data-c="winston">Winston</span> obeyed silently.</p>
+```
+
+Note: The soldier gets `data-speaker` with a descriptive slug, but is NOT wrapped in `data-c` because they're not in the Characters List.
+
---
## Important reminder
diff --git a/apps/pipeline/src/tools/chapterChunker.spec.ts b/apps/pipeline/src/tools/chapterChunker.spec.ts
new file mode 100644
index 00000000..3ccf6672
--- /dev/null
+++ b/apps/pipeline/src/tools/chapterChunker.spec.ts
@@ -0,0 +1,42 @@
+import { describe, it, expect } from "vitest";
+import { buildParagraphXml, type Paragraph } from "./chapterChunker";
+
+describe("buildParagraphXml", () => {
+ it("renders attributes and escapes quotes inside attribute values", () => {
+ const paragraph: Paragraph = {
+ elementType: "p",
+ dataIndex: 1,
+ text: "Hello world",
+ attributes: { "data-title": 'A "quoted" title' },
+ };
+
+ const output = buildParagraphXml(paragraph);
+
+ expect(output).toBe('
`,
- )
- .join("\n");
+ const paragraphsForPage = buildParagraphsForSummary(paragraphsFromChapter);
const prompt = `
## Fiction Book Chapter Summary
@@ -235,17 +227,16 @@ Provide your summary clearly organized according to the structure above, explici
let summary: ScenesSummariesPerChapter;
try {
- summary = (await callSlowGeminiWithThinkingAndSchemaAndParsed(
+ summary = (await callGrokAzureWithSchema(
`${prompt}\n Reply in the language of the book. It's usually Polish or English. Your instructions are in English so you often reply in English, buts its VERY important to reply in Polish when the book is in Polish, and same goes for other languages..`,
ScenesSummariesPerChapterSchema,
)) as ScenesSummariesPerChapter;
} catch (e) {
console.error(`Error for chapter ${chapterNum}`, e);
try {
- summary = (await callClaude(
+ summary = (await callSlowGeminiWithThinkingAndSchemaAndParsed(
`${prompt}\n Reply in the language of the book. It's usually Polish or English. Your instructions are in English so you often reply in English, buts its VERY important to reply in Polish when the book is in Polish, and same goes for other languages.`,
ScenesSummariesPerChapterSchema,
- 2,
)) as ScenesSummariesPerChapter;
} catch (e) {
console.error(`Error for chapter ${chapterNum}`, e);
diff --git a/apps/pipeline/src/tools/new-tooling/get-chapter-title.spec.ts b/apps/pipeline/src/tools/new-tooling/get-chapter-title.spec.ts
new file mode 100644
index 00000000..7abdf0bd
--- /dev/null
+++ b/apps/pipeline/src/tools/new-tooling/get-chapter-title.spec.ts
@@ -0,0 +1,107 @@
+import { DOMParser, type Element as XMLElement } from "@xmldom/xmldom";
+import { describe, it, expect } from "vitest";
+import { getChapterTitle } from "./get-chapter-title";
+
+describe("getChapterTitle", () => {
+ it("should return the chapter title", () => {
+ const chapter = `Chapter 1Content 1`;
+ const parser = new DOMParser();
+ const doc = parser.parseFromString(chapter, "text/xml");
+ const root = doc.documentElement as XMLElement;
+
+ expect(getChapterTitle(root)).toBe("Chapter 1");
+ });
+
+ it("should handle hgroup with label, ordinal, and title", () => {
+ const chapter = `
+
+
+ Book
+ II
+
+
The Castle
+
+ `;
+ const parser = new DOMParser();
+ const doc = parser.parseFromString(chapter, "text/xml");
+ const root = doc.documentElement as XMLElement;
+
+ expect(getChapterTitle(root)).toBe("Book II: The Castle");
+ });
+
+ it("should handle hgroup with ordinal and title", () => {
+ const chapter = `
+
+
I
+
I Go to Styles
+
+ `;
+ const parser = new DOMParser();
+ const doc = parser.parseFromString(chapter, "text/xml");
+ const root = doc.documentElement as XMLElement;
+
+ expect(getChapterTitle(root)).toBe("I: I Go to Styles");
+ });
+
+ it("should use data-epub-type as title when no hgroup with title exists", () => {
+ const chapter = `
+
To my Mother
+ `;
+ const parser = new DOMParser();
+ const doc = parser.parseFromString(chapter, "text/xml");
+ const root = doc.documentElement as XMLElement;
+
+ expect(getChapterTitle(root)).toBe("Dedication");
+ });
+
+ it("should handle hgroup with title but no h2", () => {
+ const chapter = `
+
+
Prologue
+
+ `;
+ const parser = new DOMParser();
+ const doc = parser.parseFromString(chapter, "text/xml");
+ const root = doc.documentElement as XMLElement;
+
+ expect(getChapterTitle(root)).toBe("Prologue");
+ });
+
+ it("should handle hgroup with title and h2 but no ordinal", () => {
+ const chapter = `
+
+
+ The Opening
+ `;
+ const parser = new DOMParser();
+ const doc = parser.parseFromString(chapter, "text/xml");
+ const root = doc.documentElement as XMLElement;
+
+ expect(getChapterTitle(root)).toBe("Act I, The Opening");
+ });
+
+ it("should handle legacy chapter with title and subtitle", () => {
+ const chapter = `
+ Chapter One.
+ In which our hero begins
+ `;
+ const parser = new DOMParser();
+ const doc = parser.parseFromString(chapter, "text/xml");
+ const root = doc.documentElement as XMLElement;
+
+ expect(getChapterTitle(root)).toBe("Chapter One, In which our hero begins");
+ });
+});
diff --git a/apps/pipeline/src/tools/new-tooling/get-chapter-title.ts b/apps/pipeline/src/tools/new-tooling/get-chapter-title.ts
index 5ef509f2..fd9bc463 100644
--- a/apps/pipeline/src/tools/new-tooling/get-chapter-title.ts
+++ b/apps/pipeline/src/tools/new-tooling/get-chapter-title.ts
@@ -2,16 +2,75 @@ import { type Element as XMLElement } from "@xmldom/xmldom";
const getTitleText = (el?: XMLElement | null) => (el ? (el.textContent || "").trim() : "");
-export const getChapterTitle = (chapter: XMLElement): string => {
- let currentAct = "";
+const getAttribute = (el: XMLElement, name: string): string | null => { // Reads the named attribute from el and returns it trimmed.
+ const attr = el.getAttribute(name);
+ return attr ? attr.trim() : null; // null when the attribute is missing or empty; a whitespace-only value is truthy, so it comes back as an empty string rather than null
+};
+
+const hasEpubType = (el: XMLElement, type: string): boolean => { // True when el's data-epub-type attribute value contains type.
+ const epubType = getAttribute(el, "data-epub-type");
+ return epubType ? epubType.includes(type) : false; // NOTE(review): this is a substring test, not a whole-token test, so "title" also matches "subtitle" - confirm that is intended
+};
+
+const extractLabelAndOrdinalFromSpans = (h2: XMLElement): { label: string; ordinal: string } => { // Collects label and ordinal text from the span descendants of h2.
+ let label = ""; // both stay empty when no matching span is found
+ let ordinal = "";
+
+ const spans = h2.getElementsByTagName("span");
+ for (let i = 0; i < spans.length; i++) {
+ const span = spans[i] as XMLElement;
+ const spanEpubType = getAttribute(span, "data-epub-type");
+ if (spanEpubType === "label") { // exact match: a span typed "label" plus any other token is not picked up here
+ label = getTitleText(span);
+ } else if (spanEpubType && spanEpubType.includes("ordinal")) { // substring match, unlike the exact label comparison above
+ ordinal = getTitleText(span);
+ }
+ }
- if (chapter.getElementsByTagName("h2").length > 0) {
- console.warn("h2 found in chapter, not supported yet");
+ return { label, ordinal }; // when several spans match, the last one in document order wins
+};
+
+const formatTitleWithOrdinal = (label: string, ordinal: string, title: string): string => { // Builds the display title from optional label/ordinal parts and the title text.
+ if (label && ordinal) {
+ return `${label} ${ordinal}: ${title}`; // e.g. label "Book", ordinal "II", title "The Castle" gives "Book II: The Castle"
}
- if (chapter.getElementsByTagName("h1").length > 0) {
- console.warn("h1 found in chapter, not supported yet");
+ if (ordinal) {
+ return `${ordinal}: ${title}`;
+ }
+ return title; // no ordinal: the bare title is returned and a label on its own is dropped
+};
+
+const getTitleFromHgroup = (hgroup: XMLElement): string | null => { // Derives a chapter title from an hgroup element; null when it has no title paragraph.
+ const titleParagraphs = Array.from(hgroup.getElementsByTagName("p")).filter((p) => // keep only p descendants whose data-epub-type contains "title"
+ hasEpubType(p as XMLElement, "title"),
+ );
+
+ if (titleParagraphs.length === 0) {
+ return null; // lets the caller fall through to its other strategies
}
+ const titleText = getTitleText(titleParagraphs[0] as XMLElement); // only the first matching paragraph is used
+ const h2Elements = hgroup.getElementsByTagName("h2");
+
+ if (h2Elements.length === 0) {
+ return titleText; // no h2 to supply a label or ordinal, so the title stands alone
+ }
+
+ const h2 = h2Elements[0] as XMLElement; // only the first h2 is considered
+
+ // Case 1: the h2 itself is typed as an ordinal, so its whole text is the ordinal and there is no label
+ if (hasEpubType(h2, "ordinal")) {
+ return formatTitleWithOrdinal("", getTitleText(h2), titleText);
+ }
+
+ // Case 2: look for label/ordinal spans nested inside the h2; if none are found the plain title is returned
+ const { label, ordinal } = extractLabelAndOrdinalFromSpans(h2);
+ return formatTitleWithOrdinal(label, ordinal, titleText);
+};
+
+const getLegacyChapterTitle = (chapter: XMLElement): string => { // Previous title logic, moved into its own function; part of the body lies outside this hunk.
+ let currentAct = "";
+
const actElements =
chapter.getElementsByTagName("h3").length > 0
? chapter.getElementsByTagName("h3")
@@ -31,16 +90,36 @@ export const getChapterTitle = (chapter: XMLElement): string => {
currentAct = getTitleText(actElements[0]);
}
+ console.log(`titleElements: ${titleElements.length}`); // NOTE(review): looks like leftover debug output that runs whenever this line is reached - consider removing before merge
+
const titleText = getTitleText(titleElements[0]);
const subtitleText = getTitleText(subtitleElements[0]);
- const chapterTitle = [
+ return [
currentAct,
titleText && subtitleText ? titleText.replace(/\.$/, "") : titleText,
subtitleText,
]
.filter(Boolean)
.join(", ");
+};
+
+export const getChapterTitle = (chapter: XMLElement): string => { // Resolves a chapter title by trying three strategies in order: hgroup, root data-epub-type, legacy headings.
+ // Strategy 1: a title derived from the first hgroup descendant
+ const hgroups = chapter.getElementsByTagName("hgroup");
+ if (hgroups.length > 0) {
+ const hgroupTitle = getTitleFromHgroup(hgroups[0] as XMLElement); // later hgroups are ignored
+ if (hgroupTitle) { // null and empty-string results both fall through to the next strategy
+ return hgroupTitle;
+ }
+ }
+
+ // Strategy 2: use the root element's data-epub-type value itself as the title
+ const epubType = getAttribute(chapter, "data-epub-type");
+ if (epubType) {
+ return epubType.charAt(0).toUpperCase() + epubType.slice(1); // upper-cases only the first character of the whole attribute value, e.g. "dedication" gives "Dedication". NOTE(review): any non-empty value here pre-empts the legacy logic below - confirm intended
+ }
- return chapterTitle;
+ // Strategy 3: legacy logic, reached only when the root has no usable data-epub-type
+ return getLegacyChapterTitle(chapter);
+};
diff --git a/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book-prompt.md b/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book-prompt.md
index 5ab75ae5..519b60bf 100644
--- a/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book-prompt.md
+++ b/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book-prompt.md
@@ -1,7 +1,7 @@
# Task
Process a text chapter by chapter. For each story character who appears or is mentioned, create reference cards reflecting the knowledge about this person - how would a human introduce someone to that character without spoiling it. The information should be based mostly on when we first meet the character, but pointing towards the knowledge we know about him from the whole book - so avoid spoilers, but use the later revealed facts to determine what's important about the initial impression.
-Maybe when we first meet the character he is working on his car in his garage. If in the rest of his book he does that from time to time, or we learn he is a mechanic, or a driver, or whatever like that, that's important detail. But if he is not mentioned in the context of cars again, that is irrelevant detail. No spoilers! Do not mention how things end or who they become. Only the most generic but relevant information. So skip anything that's surprising or important action that happened in the book, but build the background about the person. Who that person was at when the story starts. Do not mention any important life changes, like getting married, dying, getting a promotion, unless it happened at the very moment we learn about that person.
+Maybe when we first meet the character he is working on his car in his garage. If in the rest of the book he does that from time to time, or we learn he is a mechanic, or a driver, or something like that, that's an important detail. But if he is not mentioned in the context of cars again, that is an irrelevant detail. No spoilers! Do not mention how things end or who they become. Only the most generic but relevant information. So skip anything surprising and any important action that happened later in the book, but build the background about the person: who that person was when the story starts. Do not mention any important life changes, like getting married, dying, getting a promotion, unless it happened at the very moment we learn about that person.
## **Output Goal: Character-Centric History**
@@ -39,7 +39,6 @@ Return the _final, complete_ results after processing _all_ chapters in the foll
- Focus on reminding the reader who the character _is_ based on past context (role, relationships, key history), not what they _do_ or _say_ in the current chapter (N). Avoid spoilers!
- Do not write more than 1-2 short sentences about the person. This is not a summary of a book, this is a memory-jog to quickly get someone to connect character name with the actual character.
- If the person is known by two names, add the second one in the parentheses.
-- This is not a summary of a book, this is a memory-jog to quickly get someone to connect character name with the actual character.
- Jeśli tekst jest po Polsku, odpowiedz po Polsku
## Book text
diff --git a/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book.ts b/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book.ts
index ccd9e92e..73f2bfdb 100644
--- a/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book.ts
+++ b/apps/pipeline/src/tools/new-tooling/get-reference-cards-for-whole-book.ts
@@ -52,7 +52,17 @@ ${knownCharactersMapped}\n\n`
console.log("combinedPrompt length:", combinedPrompt.length);
- return callGrokWithSchema(combinedPrompt, NewReferenceCardsResponseSchema);
+ const response = await callGrokWithSchema(combinedPrompt, NewReferenceCardsResponseSchema);
+
+ // Add synthetic generic-avatar for unknown/minor speakers
+ // This will get an avatar generated but won't be passed to the rewrite prompts
+ response.characters.push({
+ name: "generic-avatar",
+ referenceCard:
+ "A mysterious figure shown from behind or in silhouette. No distinct facial features visible. Enigmatic and anonymous, suitable for representing any unnamed character. Atmospheric lighting with the figure partially obscured by shadow or mist.",
+ });
+
+ return response;
};
if (require.main === module) {
diff --git a/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.spec.ts b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.spec.ts
new file mode 100644
index 00000000..9fa1dabf
--- /dev/null
+++ b/apps/pipeline/src/tools/new-tooling/restore-unwrapped-blocks.spec.ts
@@ -0,0 +1,36 @@
+import { describe, expect, it } from "vitest";
+import { restoreUnwrappedBlocks } from "./restore-unwrapped-blocks";
+
+describe("restoreUnwrappedBlocks", () => {
+ it("wraps dangling text and inline nodes using the original block element", () => {
+ const original = "
diff --git a/apps/player/src/helpers/minorCharacterUtils.ts b/apps/player/src/helpers/minorCharacterUtils.ts
new file mode 100644
index 00000000..b1e13229
--- /dev/null
+++ b/apps/player/src/helpers/minorCharacterUtils.ts
@@ -0,0 +1,24 @@
+/**
+ * Utilities for handling unknown/minor characters that are not in characterMetadata.
+ * Unknown characters are identified by speakers that have a data-speaker attribute
+ * but don't match any known character slug in the book's character list.
+ */
+
+/**
+ * Check if a character slug is for an unknown/minor character
+ * by checking if it exists in the set of known character slugs.
+ */
+export function isUnknownCharacter(slug: string, knownSlugs: Set<string>): boolean {
+ return !knownSlugs.has(slug);
+}
+
+/**
+ * Turn a hyphen-separated slug into a display name: split on "-", upper-case the first character of each segment, join with spaces.
+ * "tall-soldier-at-gate" -> "Tall Soldier At Gate". The rest of each segment is left unchanged, and empty segments (e.g. from "a--b") are kept, which yields consecutive spaces.
+ */
+export function slugToDisplayName(slug: string): string {
+ return slug
+ .split("-")
+ .map((word) => word.charAt(0).toUpperCase() + word.slice(1)) // charAt(0) on an empty segment is "", so empty segments pass through unchanged
+ .join(" ");
+}
diff --git a/apps/player/src/locales/en/translation.json b/apps/player/src/locales/en/translation.json
index 135a1831..fe724cde 100644
--- a/apps/player/src/locales/en/translation.json
+++ b/apps/player/src/locales/en/translation.json
@@ -237,7 +237,7 @@
"chapters": "Chapters",
"paragraph": "Paragraph",
"chapter_percent": "Chapter Percent",
- "of_chapter": "of Chapter",
+ "of_chapter": "of",
"book_settings": "Book Settings",
"open_chapter": "Open Chapter",
"back_to_platform": "Back to Platform",
diff --git a/apps/player/src/locales/pl/translation.json b/apps/player/src/locales/pl/translation.json
index 2e16f837..b1e3b3a6 100644
--- a/apps/player/src/locales/pl/translation.json
+++ b/apps/player/src/locales/pl/translation.json
@@ -255,7 +255,7 @@
"chapters": "Rozdziały",
"paragraph": "Paragraf",
"chapter_percent": "Procent w Rozdziale",
- "of_chapter": "rozdziału",
+ "of_chapter": "w",
"book_settings": "Ustawienia Książki",
"open_chapter": "Otwórz Rozdział",
"back_to_platform": "Powrót do Platformy",
diff --git a/apps/player/src/services/__tests__/formatB.test.ts b/apps/player/src/services/__tests__/formatB.test.ts
index cdf9bf88..2e5152a9 100644
--- a/apps/player/src/services/__tests__/formatB.test.ts
+++ b/apps/player/src/services/__tests__/formatB.test.ts
@@ -180,6 +180,26 @@ describe("Format B", () => {
// Pure em paragraphs should become didaskalia
expect(result).toContain('data-is-didaskalia="true"');
});
+
+ it("marks pure em paragraphs inside format B speaker blocks as didaskalia", () => {
+ const input = `
+
+
To know my deed, 'twere best not know myself.
+
Knocking within
+
Wake Duncan with thy knocking! I would thou couldst!
without opening it", () => {
+ const original = [
+ "
'Then you must know where you found it?'
",
+ "
'Yes, it was on the prisoner's wardrobe.'
",
+ "
'That is better.'
",
+ ].join("\n");
+
+ const model = [
+ "
'Then you must know where you found it?'
",
+ "'Yes, it was on the prisoner's wardrobe.'",
+ "
'That is better.'
",
+ ].join("\n");
+
+ const expected = [
+ "
'Then you must know where you found it?'
",
+ "
'Yes, it was on the prisoner's wardrobe.'
",
+ "
'That is better.'
",
+ ].join("\n");
+
+ expect(restoreUnwrappedLines(original, model)).toBe(expected);
+ });
+
+ it("repairs orphan when model output is a single line", () => {
+ const original = [
+ "
'Then you must know where you found it?'
",
+ "
'Yes, it was on the prisoner's wardrobe.'
",
+ "
'That is better.'
",
+ ].join("\n");
+
+ const model =
+ "
'Then you must know where you found it?'
" +
+ "'Yes, it was on the prisoner's wardrobe.'" +
+ "
'That is better.'
";
+
+ const expected = [
+ "
'Then you must know where you found it?'
",
+ "
'Yes, it was on the prisoner's wardrobe.'
",
+ "
'That is better.'
",
+ ].join("\n");
+
+ expect(restoreUnwrappedLines(original, model)).toBe(expected);
+ });
+
+ it("wraps multiple consecutive bare lines using original wrappers", () => {
+ const original = ["
"].join("\n");
+
+ expect(restoreUnwrappedLines(original, model)).toBe(expected);
+ });
+
+ it("wraps a bare line with inline tags and trailing ", () => {
+ const original = ["
She saw the prisoner.
", "
It was unexpected.
"].join("\n");
+
+ const model = [
+ "
She saw the prisoner.
",
+ ' It was unexpected.',
+ ].join("\n");
+
+ const expected = [
+ "