Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.10
5 changes: 3 additions & 2 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
},
"model": { # Added section for model source configuration
"repo_id": "ResembleAI/chatterbox", # Default Hugging Face repository ID for the model
"use_multilingual": True, # Use multilingual model for 23 languages support (default: True)
},
"tts_engine": {
"device": "auto", # TTS processing device: 'auto', 'cuda', 'mps', or 'cpu'.
Expand All @@ -72,11 +73,11 @@
},
"generation_defaults": { # Default parameters for TTS audio generation.
"temperature": 0.8, # Controls randomness: lower is more deterministic.
"exaggeration": 0.5, # Controls expressiveness or exaggeration in speech.
"exaggeration": 0.5, # Controls expressiveness or exaggeration in speech. Range: 0.5 (subdued) to 3.0 (very dramatic).
"cfg_weight": 0.5, # Classifier-Free Guidance weight, influences adherence to prompt/style.
"seed": 0, # Random seed for generation. 0 often means random or engine default.
"speed_factor": 1.0, # Controls the speed of the generated speech.
"language": "en", # Default language for TTS.
"language": "en", # Default language for TTS. Supported: ar (Arabic), da (Danish), de (German), el (Greek), en (English), es (Spanish), fi (Finnish), fr (French), he (Hebrew), hi (Hindi), it (Italian), ja (Japanese), ko (Korean), ms (Malay), nl (Dutch), no (Norwegian), pl (Polish), pt (Portuguese), ru (Russian), sv (Swedish), sw (Swahili), tr (Turkish), zh (Chinese). Note: Quality may vary by language.
},
"audio_output": { # Settings related to the format of generated audio.
"format": "wav", # Output audio format (e.g., 'wav', 'mp3').
Expand Down
50 changes: 33 additions & 17 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
server:
host: 0.0.0.0
port: 8004
port: 8000
use_ngrok: false
use_auth: false
auth_username: user
Expand All @@ -10,17 +10,18 @@ server:
log_file_backup_count: 5
model:
repo_id: ResembleAI/chatterbox
use_multilingual: true
tts_engine:
device: cuda
device: mps
predefined_voices_path: voices
reference_audio_path: reference_audio
default_voice_id: Emily.wav
default_voice_id: default_sample.wav
paths:
model_cache: model_cache
output: outputs
generation_defaults:
temperature: 0.8
exaggeration: 1.3
exaggeration: 0.5
cfg_weight: 0.5
seed: 0
speed_factor: 1.0
Expand All @@ -30,24 +31,39 @@ audio_output:
sample_rate: 24000
max_reference_duration_sec: 30
ui_state:
last_text: 'Are you tired of slow, unreliable connections? Upgrade today to Quantum
Fiber, the fastest internet in the galaxy! Experience seamless streaming, lag-free
gaming, and instant downloads. Call now and get your first three months half price!
Don''t wait, this offer won''t last forever!

'
last_voice_mode: predefined
last_predefined_voice: Emily.wav
last_reference_file: Gianna.wav
last_seed: 3000
last_chunk_size: 240
last_text:
"\u092E\u0948\u0902 \u0938\u093E\u0915\u094D\u0937\u0940 \u0906\u0928\
\u0902\u0926 \u0939\u0942\u0901\u0964 \u092E\u0948\u0902 \u092C\u093F\u0939\u093E\
\u0930 \u0915\u0947 \u092A\u091F\u0928\u093E \u092E\u0947\u0902 \u092A\u0948\u0926\
\u093E \u0939\u0941\u0908 \u0925\u0940, \u0914\u0930 \u0905\u092C \u092A\u0941\
\u0923\u0947 \u092E\u0947\u0902 \u0930\u0939 \u0930\u0939\u0940 \u0939\u0942\u0901\
\u0964\n\n\u092E\u0948\u0902 \u092C\u0939\u0941\u0924 \u0906\u0932\u0938\u0940\
\ \u0932\u0921\u093C\u0915\u0940 \u0939\u0942\u0901\u0964 \u092E\u0948\u0902 \u0938\
\u0941\u092C\u0939 \u092C\u0939\u0941\u0924 \u0926\u0947\u0930 \u0938\u0947 \u0909\
\u0920\u0924\u0940 \u0939\u0942\u0901, \u092E\u0941\u091D\u0947 \u0917\u092A\u0936\
\u092A \u0915\u0930\u0928\u093E \u092C\u0939\u0941\u0924 \u092A\u0938\u0902\u0926\
\ \u0939\u0948\u0964 \u092E\u0948\u0902 \u090F\u0915 \u092E\u0930\u094D\u0926\
\ \u092C\u0928\u0928\u093E \u091A\u093E\u0939\u0924\u0940 \u0939\u0942\u0901\u0964\
\ \u092E\u0941\u091D\u0947 \u0913\u0936\u094B \u0915\u0947 \u0935\u093F\u091A\u093E\
\u0930 \u092C\u0939\u0941\u0924 \u092A\u0938\u0902\u0926 \u0939\u0948\u0902, \u091C\
\u093F\u0928\u094D\u0939\u094B\u0902\u0928\u0947 \u092E\u0941\u091D\u0947 \u0938\
\u093F\u0916\u093E\u092F\u093E \u0915\u093F \u092E\u0941\u0936\u094D\u0915\u093F\
\u0932 \u0938\u0935\u093E\u0932 \u092A\u0942\u091B\u0928\u093E \u0915\u093F\u0924\
\u0928\u093E \u091C\u093C\u0930\u0942\u0930\u0940 \u0939\u0948\u0964 \u0914\u0930\
\ \u092E\u0947\u0930\u093E \u0926\u093F\u092E\u093E\u0917 \u0918\u0941\u091F\u0928\
\u0947 \u092E\u0947\u0902 \u0939\u0948\u0964"
last_voice_mode: clone
last_predefined_voice: none
last_reference_file: recn.wav
last_seed: 2024
last_chunk_size: 250
last_split_text_enabled: true
hide_chunk_warning: false
hide_generation_warning: false
hide_generation_warning: true
theme: light
ui:
title: Chatterbox TTS Server
show_language_select: true
max_predefined_voices_in_dropdown: 50
max_predefined_voices_in_dropdown: 20
debug:
save_intermediate_audio: false
127 changes: 98 additions & 29 deletions engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,20 @@
import random
import numpy as np
import torch
from typing import Optional, Tuple
from typing import Optional, Tuple, Union
from pathlib import Path

from chatterbox.tts import ChatterboxTTS # Main TTS engine class

# Try to import multilingual model if available (newer versions)
try:
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
MULTILINGUAL_AVAILABLE = True
except ImportError:
ChatterboxMultilingualTTS = None # type: ignore
MULTILINGUAL_AVAILABLE = False
logging.warning("Multilingual TTS model not available. Please upgrade chatterbox-tts for multilingual support.")

from chatterbox.models.s3gen.const import (
S3GEN_SR,
) # Default sample rate from the engine
Expand All @@ -19,11 +29,12 @@
logger = logging.getLogger(__name__)

# --- Global Module Variables ---
chatterbox_model: Optional[ChatterboxTTS] = None
chatterbox_model: Optional[Union[ChatterboxTTS, 'ChatterboxMultilingualTTS']] = None
MODEL_LOADED: bool = False
model_device: Optional[str] = (
None # Stores the resolved device string ('cuda' or 'cpu')
None # Stores the resolved device string ('cuda', 'mps', or 'cpu')
)
use_multilingual_model: bool = True # Default to multilingual for broader language support


def set_seed(seed_value: int):
Expand Down Expand Up @@ -87,12 +98,13 @@ def load_model() -> bool:
Loads the TTS model.
This version directly attempts to load from the Hugging Face repository (or its cache)
using `from_pretrained`, bypassing the local `paths.model_cache` directory.
Automatically uses the multilingual model for broader language support.
Updates global variables `chatterbox_model`, `MODEL_LOADED`, and `model_device`.

Returns:
bool: True if the model was loaded successfully, False otherwise.
"""
global chatterbox_model, MODEL_LOADED, model_device
global chatterbox_model, MODEL_LOADED, model_device, use_multilingual_model

if MODEL_LOADED:
logger.info("TTS model is already loaded.")
Expand Down Expand Up @@ -157,27 +169,60 @@ def load_model() -> bool:
model_device = resolved_device_str
logger.info(f"Final device selection: {model_device}")

# Get configured model_repo_id for logging and context,
# though from_pretrained might use its own internal default if not overridden.
model_repo_id_config = config_manager.get_string(
"model.repo_id", "ResembleAI/chatterbox"
)

# Check if multilingual model should be used (default: True for broader language support)
use_multilingual_model = config_manager.get_bool("model.use_multilingual", True)

# Check if multilingual model is actually available
if use_multilingual_model and not MULTILINGUAL_AVAILABLE:
logger.warning(
"Multilingual model requested but not available in current chatterbox-tts version. "
"Using English-only model. To enable multilingual support, upgrade chatterbox-tts: "
"pip install --upgrade chatterbox-tts"
)
use_multilingual_model = False

logger.info(
f"Attempting to load model directly using from_pretrained (expected from Hugging Face repository: {model_repo_id_config} or library default)."
f"Attempting to load {'multilingual' if use_multilingual_model else 'English-only'} model using from_pretrained."
)
try:
# Directly use from_pretrained. This will utilize the standard Hugging Face cache.
# The ChatterboxTTS.from_pretrained method handles downloading if the model is not in the cache.
chatterbox_model = ChatterboxTTS.from_pretrained(device=model_device)
# The actual repo ID used by from_pretrained is often internal to the library,
# but logging the configured one provides user context.
logger.info(
f"Successfully loaded TTS model using from_pretrained on {model_device} (expected from '{model_repo_id_config}' or library default)."
)
# The model's from_pretrained method handles downloading if the model is not in the cache.
if use_multilingual_model and MULTILINGUAL_AVAILABLE:
# Workaround for MPS/CPU: Patch torch.load to use map_location for non-CUDA devices
original_torch_load = torch.load
if model_device != "cuda":
device_obj = torch.device(model_device)
def patched_torch_load(f, *args, **kwargs):
if 'map_location' not in kwargs:
kwargs['map_location'] = device_obj
return original_torch_load(f, *args, **kwargs)
torch.load = patched_torch_load

try:
chatterbox_model = ChatterboxMultilingualTTS.from_pretrained(device=model_device)

# Fix for MPS: Set attention implementation to 'eager' to avoid SDPA issues
if hasattr(chatterbox_model, 't3') and hasattr(chatterbox_model.t3, 'tfmr'):
try:
chatterbox_model.t3.tfmr.config._attn_implementation = 'eager'
logger.info("Set attention implementation to 'eager' for MPS compatibility")
except Exception as e:
logger.warning(f"Could not set attention implementation: {e}")

logger.info(
f"Successfully loaded Multilingual TTS model on {model_device}. Supports 23 languages including Hindi."
)
finally:
# Restore original torch.load
torch.load = original_torch_load
else:
chatterbox_model = ChatterboxTTS.from_pretrained(device=model_device)
logger.info(
f"Successfully loaded English-only TTS model on {model_device}."
)
except Exception as e_hf:
logger.error(
f"Failed to load model using from_pretrained (expected from '{model_repo_id_config}' or library default): {e_hf}",
f"Failed to load {'multilingual' if use_multilingual_model else 'English-only'} model: {e_hf}",
exc_info=True,
)
chatterbox_model = None
Expand Down Expand Up @@ -214,6 +259,7 @@ def synthesize(
exaggeration: float = 0.5,
cfg_weight: float = 0.5,
seed: int = 0,
language_id: Optional[str] = None,
) -> Tuple[Optional[torch.Tensor], Optional[int]]:
"""
Synthesizes audio from text using the loaded TTS model.
Expand All @@ -226,12 +272,14 @@ def synthesize(
cfg_weight: Classifier-Free Guidance weight.
seed: Random seed for generation. If 0, default randomness is used.
If non-zero, a global seed is set for reproducibility.
language_id: Language code for multilingual model (e.g., 'hi' for Hindi, 'en' for English).
Only used with multilingual model. If None, defaults to config language.

Returns:
A tuple containing the audio waveform (torch.Tensor) and the sample rate (int),
or (None, None) if synthesis fails.
"""
global chatterbox_model
global chatterbox_model, use_multilingual_model

if not MODEL_LOADED or chatterbox_model is None:
logger.error("TTS model is not loaded. Cannot synthesize audio.")
Expand All @@ -249,19 +297,40 @@ def synthesize(

logger.debug(
f"Synthesizing with params: audio_prompt='{audio_prompt_path}', temp={temperature}, "
f"exag={exaggeration}, cfg_weight={cfg_weight}, seed_applied_globally_if_nonzero={seed}"
f"exag={exaggeration}, cfg_weight={cfg_weight}, seed_applied_globally_if_nonzero={seed}, "
f"language_id={language_id}"
)

# Call the core model's generate method
wav_tensor = chatterbox_model.generate(
text=text,
audio_prompt_path=audio_prompt_path,
temperature=temperature,
exaggeration=exaggeration,
cfg_weight=cfg_weight,
)
# For multilingual model, include language_id parameter if available
if use_multilingual_model and MULTILINGUAL_AVAILABLE and isinstance(chatterbox_model, ChatterboxMultilingualTTS):
# Use provided language_id or default from config
effective_language = language_id or config_manager.get_string("generation_defaults.language", "en")
logger.info(f"Generating speech for language: {effective_language}")
wav_tensor = chatterbox_model.generate(
text=text,
audio_prompt_path=audio_prompt_path,
temperature=temperature,
exaggeration=exaggeration,
cfg_weight=cfg_weight,
language_id=effective_language,
)
else:
# English-only model doesn't use language_id parameter
if language_id and language_id != "en":
logger.warning(
f"Language '{language_id}' requested but multilingual model not available. "
"Generating in English. Upgrade chatterbox-tts for multilingual support."
)
wav_tensor = chatterbox_model.generate(
text=text,
audio_prompt_path=audio_prompt_path,
temperature=temperature,
exaggeration=exaggeration,
cfg_weight=cfg_weight,
)

# The ChatterboxTTS.generate method already returns a CPU tensor.
# The model's generate method already returns a CPU tensor.
return wav_tensor, chatterbox_model.sr

except Exception as e:
Expand Down
6 changes: 6 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
def main() -> None:
    """Entry point for the chatterbox-tts-server package: emit a greeting."""
    greeting = "Hello from chatterbox-tts-server!"
    print(greeting)


if __name__ == "__main__":
    main()
6 changes: 3 additions & 3 deletions models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ class GenerationParams(BaseModel):
)
exaggeration: Optional[float] = Field(
None,
ge=0.25, # Based on Chatterbox Gradio app
le=2.0, # Based on Chatterbox Gradio app
description="Controls expressiveness/exaggeration. (Range: 0.25-2.0)",
ge=0.5, # Based on Chatterbox tutorial notebook
le=3.0, # Based on Chatterbox tutorial notebook
description="Controls expressiveness/exaggeration. Lower values are more subdued/calm, higher values are more dramatic/energetic. (Range: 0.5-3.0)",
)
cfg_weight: Optional[float] = Field(
None,
Expand Down
7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[project]
name = "chatterbox-tts-server"
version = "0.1.0"
description = "FastAPI server exposing Chatterbox TTS with multilingual speech synthesis"
readme = "README.md"
requires-python = ">=3.10"
dependencies = []
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ fastapi # Modern async web framework
uvicorn[standard] # ASGI server with performance extras

# --- Scientific Computing & ML Libraries ---
numpy==1.26.4 # Fundamental numerical computing
numpy # Fundamental numerical computing
librosa # Advanced audio/music analysis
safetensors # Safe tensor serialization format
descript-audio-codec # Audio codec for ML applications
Expand Down
1 change: 1 addition & 0 deletions run.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/usr/bin/env bash
# Launch the TTS server inside the project's virtual environment.
# set -e: abort immediately if the venv is missing or activation fails,
# instead of silently falling through to the system python.
set -e
source .venv/bin/activate && python server.py
11 changes: 11 additions & 0 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ class OpenAISpeechRequest(BaseModel):
response_format: Literal["wav", "opus", "mp3"] = "wav" # Add "mp3"
speed: float = 1.0
seed: Optional[int] = None
language: Optional[str] = None # Added language support for multilingual


# --- Logging Configuration ---
Expand Down Expand Up @@ -758,6 +759,11 @@ async def custom_tts_endpoint(
seed=(
request.seed if request.seed is not None else get_gen_default_seed()
),
language_id=(
request.language
if request.language is not None
else get_gen_default_language()
),
)
perf_monitor.record(f"Engine synthesized chunk {i+1}")

Expand Down Expand Up @@ -936,6 +942,11 @@ async def openai_speech_endpoint(request: OpenAISpeechRequest):
exaggeration=get_gen_default_exaggeration(),
cfg_weight=get_gen_default_cfg_weight(),
seed=seed_to_use,
language_id=(
request.language
if request.language is not None
else get_gen_default_language()
),
)

if audio_tensor is None or sr is None:
Expand Down
Loading