Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.10
5 changes: 3 additions & 2 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
},
"model": { # Added section for model source configuration
"repo_id": "ResembleAI/chatterbox", # Default Hugging Face repository ID for the model
"use_multilingual": True, # Use multilingual model for 23 languages support (default: True)
},
"tts_engine": {
"device": "auto", # TTS processing device: 'auto', 'cuda', 'mps', or 'cpu'.
Expand All @@ -72,11 +73,11 @@
},
"generation_defaults": { # Default parameters for TTS audio generation.
"temperature": 0.8, # Controls randomness: lower is more deterministic.
"exaggeration": 0.5, # Controls expressiveness or exaggeration in speech.
"exaggeration": 0.5, # Controls expressiveness or exaggeration in speech. Range: 0.5 (subdued) to 3.0 (very dramatic).
"cfg_weight": 0.5, # Classifier-Free Guidance weight, influences adherence to prompt/style.
"seed": 0, # Random seed for generation. 0 often means random or engine default.
"speed_factor": 1.0, # Controls the speed of the generated speech.
"language": "en", # Default language for TTS.
"language": "en", # Default language for TTS. Supported: ar (Arabic), da (Danish), de (German), el (Greek), en (English), es (Spanish), fi (Finnish), fr (French), he (Hebrew), hi (Hindi), it (Italian), ja (Japanese), ko (Korean), ms (Malay), nl (Dutch), no (Norwegian), pl (Polish), pt (Portuguese), ru (Russian), sv (Swedish), sw (Swahili), tr (Turkish), zh (Chinese). Note: Quality may vary by language.
},
"audio_output": { # Settings related to the format of generated audio.
"format": "wav", # Output audio format (e.g., 'wav', 'mp3').
Expand Down
50 changes: 33 additions & 17 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
server:
host: 0.0.0.0
port: 8004
port: 8000
use_ngrok: false
use_auth: false
auth_username: user
Expand All @@ -10,17 +10,18 @@ server:
log_file_backup_count: 5
model:
repo_id: ResembleAI/chatterbox
use_multilingual: true
tts_engine:
device: cuda
device: mps
predefined_voices_path: voices
reference_audio_path: reference_audio
default_voice_id: Emily.wav
default_voice_id: default_sample.wav
paths:
model_cache: model_cache
output: outputs
generation_defaults:
temperature: 0.8
exaggeration: 1.3
exaggeration: 0.5
cfg_weight: 0.5
seed: 0
speed_factor: 1.0
Expand All @@ -30,24 +31,39 @@ audio_output:
sample_rate: 24000
max_reference_duration_sec: 30
ui_state:
last_text: 'Are you tired of slow, unreliable connections? Upgrade today to Quantum
Fiber, the fastest internet in the galaxy! Experience seamless streaming, lag-free
gaming, and instant downloads. Call now and get your first three months half price!
Don''t wait, this offer won''t last forever!

'
last_voice_mode: predefined
last_predefined_voice: Emily.wav
last_reference_file: Gianna.wav
last_seed: 3000
last_chunk_size: 240
last_text:
"\u092E\u0948\u0902 \u0938\u093E\u0915\u094D\u0937\u0940 \u0906\u0928\
\u0902\u0926 \u0939\u0942\u0901\u0964 \u092E\u0948\u0902 \u092C\u093F\u0939\u093E\
\u0930 \u0915\u0947 \u092A\u091F\u0928\u093E \u092E\u0947\u0902 \u092A\u0948\u0926\
\u093E \u0939\u0941\u0908 \u0925\u0940, \u0914\u0930 \u0905\u092C \u092A\u0941\
\u0923\u0947 \u092E\u0947\u0902 \u0930\u0939 \u0930\u0939\u0940 \u0939\u0942\u0901\
\u0964\n\n\u092E\u0948\u0902 \u092C\u0939\u0941\u0924 \u0906\u0932\u0938\u0940\
\ \u0932\u0921\u093C\u0915\u0940 \u0939\u0942\u0901\u0964 \u092E\u0948\u0902 \u0938\
\u0941\u092C\u0939 \u092C\u0939\u0941\u0924 \u0926\u0947\u0930 \u0938\u0947 \u0909\
\u0920\u0924\u0940 \u0939\u0942\u0901, \u092E\u0941\u091D\u0947 \u0917\u092A\u0936\
\u092A \u0915\u0930\u0928\u093E \u092C\u0939\u0941\u0924 \u092A\u0938\u0902\u0926\
\ \u0939\u0948\u0964 \u092E\u0948\u0902 \u090F\u0915 \u092E\u0930\u094D\u0926\
\ \u092C\u0928\u0928\u093E \u091A\u093E\u0939\u0924\u0940 \u0939\u0942\u0901\u0964\
\ \u092E\u0941\u091D\u0947 \u0913\u0936\u094B \u0915\u0947 \u0935\u093F\u091A\u093E\
\u0930 \u092C\u0939\u0941\u0924 \u092A\u0938\u0902\u0926 \u0939\u0948\u0902, \u091C\
\u093F\u0928\u094D\u0939\u094B\u0902\u0928\u0947 \u092E\u0941\u091D\u0947 \u0938\
\u093F\u0916\u093E\u092F\u093E \u0915\u093F \u092E\u0941\u0936\u094D\u0915\u093F\
\u0932 \u0938\u0935\u093E\u0932 \u092A\u0942\u091B\u0928\u093E \u0915\u093F\u0924\
\u0928\u093E \u091C\u093C\u0930\u0942\u0930\u0940 \u0939\u0948\u0964 \u0914\u0930\
\ \u092E\u0947\u0930\u093E \u0926\u093F\u092E\u093E\u0917 \u0918\u0941\u091F\u0928\
\u0947 \u092E\u0947\u0902 \u0939\u0948\u0964"
last_voice_mode: clone
last_predefined_voice: none
last_reference_file: recn.wav
last_seed: 2024
last_chunk_size: 250
last_split_text_enabled: true
hide_chunk_warning: false
hide_generation_warning: false
hide_generation_warning: true
theme: light
ui:
title: Chatterbox TTS Server
show_language_select: true
max_predefined_voices_in_dropdown: 50
max_predefined_voices_in_dropdown: 20
debug:
save_intermediate_audio: false
127 changes: 98 additions & 29 deletions engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,20 @@
import random
import numpy as np
import torch
from typing import Optional, Tuple
from typing import Optional, Tuple, Union
from pathlib import Path

from chatterbox.tts import ChatterboxTTS # Main TTS engine class

# Try to import multilingual model if available (newer versions)
try:
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
MULTILINGUAL_AVAILABLE = True
except ImportError:
ChatterboxMultilingualTTS = None # type: ignore
MULTILINGUAL_AVAILABLE = False
logging.warning("Multilingual TTS model not available. Please upgrade chatterbox-tts for multilingual support.")

from chatterbox.models.s3gen.const import (
S3GEN_SR,
) # Default sample rate from the engine
Expand All @@ -19,11 +29,12 @@
logger = logging.getLogger(__name__)

# --- Global Module Variables ---
chatterbox_model: Optional[ChatterboxTTS] = None
chatterbox_model: Optional[Union[ChatterboxTTS, 'ChatterboxMultilingualTTS']] = None
MODEL_LOADED: bool = False
model_device: Optional[str] = (
None # Stores the resolved device string ('cuda' or 'cpu')
None # Stores the resolved device string ('cuda', 'mps', or 'cpu')
)
use_multilingual_model: bool = True # Default to multilingual for broader language support


def set_seed(seed_value: int):
Expand Down Expand Up @@ -87,12 +98,13 @@ def load_model() -> bool:
Loads the TTS model.
This version directly attempts to load from the Hugging Face repository (or its cache)
using `from_pretrained`, bypassing the local `paths.model_cache` directory.
Automatically uses the multilingual model for broader language support.
Updates global variables `chatterbox_model`, `MODEL_LOADED`, and `model_device`.

Returns:
bool: True if the model was loaded successfully, False otherwise.
"""
global chatterbox_model, MODEL_LOADED, model_device
global chatterbox_model, MODEL_LOADED, model_device, use_multilingual_model

if MODEL_LOADED:
logger.info("TTS model is already loaded.")
Expand Down Expand Up @@ -157,27 +169,60 @@ def load_model() -> bool:
model_device = resolved_device_str
logger.info(f"Final device selection: {model_device}")

# Get configured model_repo_id for logging and context,
# though from_pretrained might use its own internal default if not overridden.
model_repo_id_config = config_manager.get_string(
"model.repo_id", "ResembleAI/chatterbox"
)

# Check if multilingual model should be used (default: True for broader language support)
use_multilingual_model = config_manager.get_bool("model.use_multilingual", True)

# Check if multilingual model is actually available
if use_multilingual_model and not MULTILINGUAL_AVAILABLE:
logger.warning(
"Multilingual model requested but not available in current chatterbox-tts version. "
"Using English-only model. To enable multilingual support, upgrade chatterbox-tts: "
"pip install --upgrade chatterbox-tts"
)
use_multilingual_model = False

logger.info(
f"Attempting to load model directly using from_pretrained (expected from Hugging Face repository: {model_repo_id_config} or library default)."
f"Attempting to load {'multilingual' if use_multilingual_model else 'English-only'} model using from_pretrained."
)
try:
# Directly use from_pretrained. This will utilize the standard Hugging Face cache.
# The ChatterboxTTS.from_pretrained method handles downloading if the model is not in the cache.
chatterbox_model = ChatterboxTTS.from_pretrained(device=model_device)
# The actual repo ID used by from_pretrained is often internal to the library,
# but logging the configured one provides user context.
logger.info(
f"Successfully loaded TTS model using from_pretrained on {model_device} (expected from '{model_repo_id_config}' or library default)."
)
# The model's from_pretrained method handles downloading if the model is not in the cache.
if use_multilingual_model and MULTILINGUAL_AVAILABLE:
# Workaround for MPS/CPU: Patch torch.load to use map_location for non-CUDA devices
original_torch_load = torch.load
if model_device != "cuda":
device_obj = torch.device(model_device)
def patched_torch_load(f, *args, **kwargs):
if 'map_location' not in kwargs:
kwargs['map_location'] = device_obj
return original_torch_load(f, *args, **kwargs)
torch.load = patched_torch_load

try:
chatterbox_model = ChatterboxMultilingualTTS.from_pretrained(device=model_device)

# Fix for MPS: Set attention implementation to 'eager' to avoid SDPA issues
if hasattr(chatterbox_model, 't3') and hasattr(chatterbox_model.t3, 'tfmr'):
try:
chatterbox_model.t3.tfmr.config._attn_implementation = 'eager'
logger.info("Set attention implementation to 'eager' for MPS compatibility")
except Exception as e:
logger.warning(f"Could not set attention implementation: {e}")

logger.info(
f"Successfully loaded Multilingual TTS model on {model_device}. Supports 23 languages including Hindi."
)
finally:
# Restore original torch.load
torch.load = original_torch_load
else:
chatterbox_model = ChatterboxTTS.from_pretrained(device=model_device)
logger.info(
f"Successfully loaded English-only TTS model on {model_device}."
)
except Exception as e_hf:
logger.error(
f"Failed to load model using from_pretrained (expected from '{model_repo_id_config}' or library default): {e_hf}",
f"Failed to load {'multilingual' if use_multilingual_model else 'English-only'} model: {e_hf}",
exc_info=True,
)
chatterbox_model = None
Expand Down Expand Up @@ -214,6 +259,7 @@ def synthesize(
exaggeration: float = 0.5,
cfg_weight: float = 0.5,
seed: int = 0,
language_id: Optional[str] = None,
) -> Tuple[Optional[torch.Tensor], Optional[int]]:
"""
Synthesizes audio from text using the loaded TTS model.
Expand All @@ -226,12 +272,14 @@ def synthesize(
cfg_weight: Classifier-Free Guidance weight.
seed: Random seed for generation. If 0, default randomness is used.
If non-zero, a global seed is set for reproducibility.
language_id: Language code for multilingual model (e.g., 'hi' for Hindi, 'en' for English).
Only used with multilingual model. If None, defaults to config language.

Returns:
A tuple containing the audio waveform (torch.Tensor) and the sample rate (int),
or (None, None) if synthesis fails.
"""
global chatterbox_model
global chatterbox_model, use_multilingual_model

if not MODEL_LOADED or chatterbox_model is None:
logger.error("TTS model is not loaded. Cannot synthesize audio.")
Expand All @@ -249,19 +297,40 @@ def synthesize(

logger.debug(
f"Synthesizing with params: audio_prompt='{audio_prompt_path}', temp={temperature}, "
f"exag={exaggeration}, cfg_weight={cfg_weight}, seed_applied_globally_if_nonzero={seed}"
f"exag={exaggeration}, cfg_weight={cfg_weight}, seed_applied_globally_if_nonzero={seed}, "
f"language_id={language_id}"
)

# Call the core model's generate method
wav_tensor = chatterbox_model.generate(
text=text,
audio_prompt_path=audio_prompt_path,
temperature=temperature,
exaggeration=exaggeration,
cfg_weight=cfg_weight,
)
# For multilingual model, include language_id parameter if available
if use_multilingual_model and MULTILINGUAL_AVAILABLE and isinstance(chatterbox_model, ChatterboxMultilingualTTS):
# Use provided language_id or default from config
effective_language = language_id or config_manager.get_string("generation_defaults.language", "en")
logger.info(f"Generating speech for language: {effective_language}")
wav_tensor = chatterbox_model.generate(
text=text,
audio_prompt_path=audio_prompt_path,
temperature=temperature,
exaggeration=exaggeration,
cfg_weight=cfg_weight,
language_id=effective_language,
)
else:
# English-only model doesn't use language_id parameter
if language_id and language_id != "en":
logger.warning(
f"Language '{language_id}' requested but multilingual model not available. "
"Generating in English. Upgrade chatterbox-tts for multilingual support."
)
wav_tensor = chatterbox_model.generate(
text=text,
audio_prompt_path=audio_prompt_path,
temperature=temperature,
exaggeration=exaggeration,
cfg_weight=cfg_weight,
)

# The ChatterboxTTS.generate method already returns a CPU tensor.
# The model's generate method already returns a CPU tensor.
return wav_tensor, chatterbox_model.sr

except Exception as e:
Expand Down
6 changes: 6 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
def main() -> None:
    """Entry point for the chatterbox-tts-server package: emit a greeting."""
    greeting = "Hello from chatterbox-tts-server!"
    print(greeting)


if __name__ == "__main__":
    main()
6 changes: 3 additions & 3 deletions models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ class GenerationParams(BaseModel):
)
exaggeration: Optional[float] = Field(
None,
ge=0.25, # Based on Chatterbox Gradio app
le=2.0, # Based on Chatterbox Gradio app
description="Controls expressiveness/exaggeration. (Range: 0.25-2.0)",
ge=0.5, # Based on Chatterbox tutorial notebook
le=3.0, # Based on Chatterbox tutorial notebook
description="Controls expressiveness/exaggeration. Lower values are more subdued/calm, higher values are more dramatic/energetic. (Range: 0.5-3.0)",
)
cfg_weight: Optional[float] = Field(
None,
Expand Down
7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[project]
name = "chatterbox-tts-server"
version = "0.1.0"
description = "FastAPI server exposing Chatterbox TTS with multilingual speech synthesis"
readme = "README.md"
requires-python = ">=3.10"
dependencies = []
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ fastapi # Modern async web framework
uvicorn[standard] # ASGI server with performance extras

# --- Scientific Computing & ML Libraries ---
numpy==1.26.4 # Fundamental numerical computing
numpy # Fundamental numerical computing
librosa # Advanced audio/music analysis
safetensors # Safe tensor serialization format
descript-audio-codec # Audio codec for ML applications
Expand Down
1 change: 1 addition & 0 deletions run.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/usr/bin/env bash
# Launch the TTS server inside the project's virtual environment.
# set -e: abort immediately if the venv is missing or activation fails,
# instead of silently falling through to the system python.
set -e
source .venv/bin/activate && python server.py
11 changes: 11 additions & 0 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ class OpenAISpeechRequest(BaseModel):
response_format: Literal["wav", "opus", "mp3"] = "wav" # Add "mp3"
speed: float = 1.0
seed: Optional[int] = None
language: Optional[str] = None # Added language support for multilingual


# --- Logging Configuration ---
Expand Down Expand Up @@ -758,6 +759,11 @@ async def custom_tts_endpoint(
seed=(
request.seed if request.seed is not None else get_gen_default_seed()
),
language_id=(
request.language
if request.language is not None
else get_gen_default_language()
),
)
perf_monitor.record(f"Engine synthesized chunk {i+1}")

Expand Down Expand Up @@ -936,6 +942,11 @@ async def openai_speech_endpoint(request: OpenAISpeechRequest):
exaggeration=get_gen_default_exaggeration(),
cfg_weight=get_gen_default_cfg_weight(),
seed=seed_to_use,
language_id=(
request.language
if request.language is not None
else get_gen_default_language()
),
)

if audio_tensor is None or sr is None:
Expand Down
Loading