diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..c8cfe39 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10 diff --git a/config.py b/config.py index 2fdea6f..91f9ac8 100644 --- a/config.py +++ b/config.py @@ -50,6 +50,7 @@ }, "model": { # Added section for model source configuration "repo_id": "ResembleAI/chatterbox", # Default Hugging Face repository ID for the model + "use_multilingual": True, # Use the multilingual model, which supports 23 languages (default: True) }, "tts_engine": { "device": "auto", # TTS processing device: 'auto', 'cuda', 'mps', or 'cpu'. @@ -72,11 +73,11 @@ }, "generation_defaults": { # Default parameters for TTS audio generation. "temperature": 0.8, # Controls randomness: lower is more deterministic. - "exaggeration": 0.5, # Controls expressiveness or exaggeration in speech. + "exaggeration": 0.5, # Controls expressiveness or exaggeration in speech. Range: 0.5 (subdued) to 3.0 (very dramatic). "cfg_weight": 0.5, # Classifier-Free Guidance weight, influences adherence to prompt/style. "seed": 0, # Random seed for generation. 0 often means random or engine default. "speed_factor": 1.0, # Controls the speed of the generated speech. - "language": "en", # Default language for TTS. + "language": "en", # Default language for TTS. Supported by the multilingual model: ar (Arabic), da (Danish), de (German), el (Greek), en (English), es (Spanish), fi (Finnish), fr (French), he (Hebrew), hi (Hindi), it (Italian), ja (Japanese), ko (Korean), ms (Malay), nl (Dutch), no (Norwegian), pl (Polish), pt (Portuguese), ru (Russian), sv (Swedish), sw (Swahili), tr (Turkish), zh (Chinese). Note: Quality may vary by language; the English-only model does not use this setting. }, "audio_output": { # Settings related to the format of generated audio. "format": "wav", # Output audio format (e.g., 'wav', 'mp3'). 
diff --git a/config.yaml b/config.yaml index f610286..4fc64f0 100644 --- a/config.yaml +++ b/config.yaml @@ -1,6 +1,6 @@ server: host: 0.0.0.0 - port: 8004 + port: 8000 use_ngrok: false use_auth: false auth_username: user @@ -10,17 +10,18 @@ server: log_file_backup_count: 5 model: repo_id: ResembleAI/chatterbox + use_multilingual: true tts_engine: - device: cuda + device: mps predefined_voices_path: voices reference_audio_path: reference_audio - default_voice_id: Emily.wav + default_voice_id: default_sample.wav paths: model_cache: model_cache output: outputs generation_defaults: temperature: 0.8 - exaggeration: 1.3 + exaggeration: 0.5 cfg_weight: 0.5 seed: 0 speed_factor: 1.0 @@ -30,24 +31,39 @@ audio_output: sample_rate: 24000 max_reference_duration_sec: 30 ui_state: - last_text: 'Are you tired of slow, unreliable connections? Upgrade today to Quantum - Fiber, the fastest internet in the galaxy! Experience seamless streaming, lag-free - gaming, and instant downloads. Call now and get your first three months half price! - Don''t wait, this offer won''t last forever! 
- - ' - last_voice_mode: predefined - last_predefined_voice: Emily.wav - last_reference_file: Gianna.wav - last_seed: 3000 - last_chunk_size: 240 + last_text: + "\u092E\u0948\u0902 \u0938\u093E\u0915\u094D\u0937\u0940 \u0906\u0928\ + \u0902\u0926 \u0939\u0942\u0901\u0964 \u092E\u0948\u0902 \u092C\u093F\u0939\u093E\ + \u0930 \u0915\u0947 \u092A\u091F\u0928\u093E \u092E\u0947\u0902 \u092A\u0948\u0926\ + \u093E \u0939\u0941\u0908 \u0925\u0940, \u0914\u0930 \u0905\u092C \u092A\u0941\ + \u0923\u0947 \u092E\u0947\u0902 \u0930\u0939 \u0930\u0939\u0940 \u0939\u0942\u0901\ + \u0964\n\n\u092E\u0948\u0902 \u092C\u0939\u0941\u0924 \u0906\u0932\u0938\u0940\ + \ \u0932\u0921\u093C\u0915\u0940 \u0939\u0942\u0901\u0964 \u092E\u0948\u0902 \u0938\ + \u0941\u092C\u0939 \u092C\u0939\u0941\u0924 \u0926\u0947\u0930 \u0938\u0947 \u0909\ + \u0920\u0924\u0940 \u0939\u0942\u0901, \u092E\u0941\u091D\u0947 \u0917\u092A\u0936\ + \u092A \u0915\u0930\u0928\u093E \u092C\u0939\u0941\u0924 \u092A\u0938\u0902\u0926\ + \ \u0939\u0948\u0964 \u092E\u0948\u0902 \u090F\u0915 \u092E\u0930\u094D\u0926\ + \ \u092C\u0928\u0928\u093E \u091A\u093E\u0939\u0924\u0940 \u0939\u0942\u0901\u0964\ + \ \u092E\u0941\u091D\u0947 \u0913\u0936\u094B \u0915\u0947 \u0935\u093F\u091A\u093E\ + \u0930 \u092C\u0939\u0941\u0924 \u092A\u0938\u0902\u0926 \u0939\u0948\u0902, \u091C\ + \u093F\u0928\u094D\u0939\u094B\u0902\u0928\u0947 \u092E\u0941\u091D\u0947 \u0938\ + \u093F\u0916\u093E\u092F\u093E \u0915\u093F \u092E\u0941\u0936\u094D\u0915\u093F\ + \u0932 \u0938\u0935\u093E\u0932 \u092A\u0942\u091B\u0928\u093E \u0915\u093F\u0924\ + \u0928\u093E \u091C\u093C\u0930\u0942\u0930\u0940 \u0939\u0948\u0964 \u0914\u0930\ + \ \u092E\u0947\u0930\u093E \u0926\u093F\u092E\u093E\u0917 \u0918\u0941\u091F\u0928\ + \u0947 \u092E\u0947\u0902 \u0939\u0948\u0964" + last_voice_mode: clone + last_predefined_voice: none + last_reference_file: recn.wav + last_seed: 2024 + last_chunk_size: 250 last_split_text_enabled: true hide_chunk_warning: false - 
hide_generation_warning: false + hide_generation_warning: true theme: light ui: title: Chatterbox TTS Server show_language_select: true - max_predefined_voices_in_dropdown: 50 + max_predefined_voices_in_dropdown: 20 debug: save_intermediate_audio: false diff --git a/engine.py b/engine.py index e42426e..b764a03 100644 --- a/engine.py +++ b/engine.py @@ -5,10 +5,20 @@ import random import numpy as np import torch -from typing import Optional, Tuple +from typing import Optional, Tuple, Union from pathlib import Path from chatterbox.tts import ChatterboxTTS # Main TTS engine class + +# Try to import multilingual model if available (newer versions) +try: + from chatterbox.mtl_tts import ChatterboxMultilingualTTS + MULTILINGUAL_AVAILABLE = True +except ImportError: + ChatterboxMultilingualTTS = None # type: ignore + MULTILINGUAL_AVAILABLE = False + logging.warning("Multilingual TTS model not available. Please upgrade chatterbox-tts for multilingual support.") + from chatterbox.models.s3gen.const import ( S3GEN_SR, ) # Default sample rate from the engine @@ -19,11 +29,12 @@ logger = logging.getLogger(__name__) # --- Global Module Variables --- -chatterbox_model: Optional[ChatterboxTTS] = None +chatterbox_model: Optional[Union[ChatterboxTTS, 'ChatterboxMultilingualTTS']] = None MODEL_LOADED: bool = False model_device: Optional[str] = ( - None # Stores the resolved device string ('cuda' or 'cpu') + None # Stores the resolved device string ('cuda', 'mps', or 'cpu') ) +use_multilingual_model: bool = True # Default to multilingual for broader language support def set_seed(seed_value: int): @@ -87,12 +98,13 @@ def load_model() -> bool: Loads the TTS model. This version directly attempts to load from the Hugging Face repository (or its cache) using `from_pretrained`, bypassing the local `paths.model_cache` directory. + Uses the multilingual model when `model.use_multilingual` is enabled (the default) and the installed chatterbox-tts provides it; otherwise falls back to the English-only model. Updates global variables `chatterbox_model`, `MODEL_LOADED`, and `model_device`. 
Returns: bool: True if the model was loaded successfully, False otherwise. """ - global chatterbox_model, MODEL_LOADED, model_device + global chatterbox_model, MODEL_LOADED, model_device, use_multilingual_model if MODEL_LOADED: logger.info("TTS model is already loaded.") @@ -157,27 +169,60 @@ def load_model() -> bool: model_device = resolved_device_str logger.info(f"Final device selection: {model_device}") - # Get configured model_repo_id for logging and context, - # though from_pretrained might use its own internal default if not overridden. - model_repo_id_config = config_manager.get_string( - "model.repo_id", "ResembleAI/chatterbox" - ) - + # Check if multilingual model should be used (default: True for broader language support) + use_multilingual_model = config_manager.get_bool("model.use_multilingual", True) + + # Check if multilingual model is actually available + if use_multilingual_model and not MULTILINGUAL_AVAILABLE: + logger.warning( + "Multilingual model requested but not available in current chatterbox-tts version. " + "Using English-only model. To enable multilingual support, upgrade chatterbox-tts: " + "pip install --upgrade chatterbox-tts" + ) + use_multilingual_model = False + logger.info( - f"Attempting to load model directly using from_pretrained (expected from Hugging Face repository: {model_repo_id_config} or library default)." + f"Attempting to load {'multilingual' if use_multilingual_model else 'English-only'} model using from_pretrained." ) try: # Directly use from_pretrained. This will utilize the standard Hugging Face cache. - # The ChatterboxTTS.from_pretrained method handles downloading if the model is not in the cache. - chatterbox_model = ChatterboxTTS.from_pretrained(device=model_device) - # The actual repo ID used by from_pretrained is often internal to the library, - # but logging the configured one provides user context. 
- logger.info( - f"Successfully loaded TTS model using from_pretrained on {model_device} (expected from '{model_repo_id_config}' or library default)." - ) + # The model's from_pretrained method handles downloading if the model is not in the cache. + if use_multilingual_model and MULTILINGUAL_AVAILABLE: + # Workaround for MPS/CPU: temporarily patch the process-wide torch.load to default map_location on non-CUDA devices (restored in the finally below) + original_torch_load = torch.load + if model_device != "cuda": + device_obj = torch.device(model_device) + def patched_torch_load(f, *args, **kwargs): + if 'map_location' not in kwargs: + kwargs['map_location'] = device_obj + return original_torch_load(f, *args, **kwargs) + torch.load = patched_torch_load + + try: + chatterbox_model = ChatterboxMultilingualTTS.from_pretrained(device=model_device) + + # Set attention implementation to 'eager' to avoid SDPA issues on MPS (NOTE: applied on every device, not only MPS) + if hasattr(chatterbox_model, 't3') and hasattr(chatterbox_model.t3, 'tfmr'): + try: + chatterbox_model.t3.tfmr.config._attn_implementation = 'eager' + logger.info("Set attention implementation to 'eager' for MPS compatibility") + except Exception as e: + logger.warning(f"Could not set attention implementation: {e}") + + logger.info( + f"Successfully loaded Multilingual TTS model on {model_device}. Supports 23 languages including Hindi." + ) + finally: + # Restore original torch.load + torch.load = original_torch_load + else: + chatterbox_model = ChatterboxTTS.from_pretrained(device=model_device) + logger.info( + f"Successfully loaded English-only TTS model on {model_device}." 
+ ) except Exception as e_hf: logger.error( - f"Failed to load model using from_pretrained (expected from '{model_repo_id_config}' or library default): {e_hf}", + f"Failed to load {'multilingual' if use_multilingual_model else 'English-only'} model: {e_hf}", exc_info=True, ) chatterbox_model = None @@ -214,6 +259,7 @@ def synthesize( exaggeration: float = 0.5, cfg_weight: float = 0.5, seed: int = 0, + language_id: Optional[str] = None, ) -> Tuple[Optional[torch.Tensor], Optional[int]]: """ Synthesizes audio from text using the loaded TTS model. @@ -226,12 +272,14 @@ def synthesize( cfg_weight: Classifier-Free Guidance weight. seed: Random seed for generation. If 0, default randomness is used. If non-zero, a global seed is set for reproducibility. + language_id: Language code for multilingual model (e.g., 'hi' for Hindi, 'en' for English). + Only used with multilingual model. If None, defaults to config language. Returns: A tuple containing the audio waveform (torch.Tensor) and the sample rate (int), or (None, None) if synthesis fails. """ - global chatterbox_model + global chatterbox_model, use_multilingual_model if not MODEL_LOADED or chatterbox_model is None: logger.error("TTS model is not loaded. 
Cannot synthesize audio.") @@ -249,19 +297,40 @@ def synthesize( logger.debug( f"Synthesizing with params: audio_prompt='{audio_prompt_path}', temp={temperature}, " - f"exag={exaggeration}, cfg_weight={cfg_weight}, seed_applied_globally_if_nonzero={seed}" + f"exag={exaggeration}, cfg_weight={cfg_weight}, seed_applied_globally_if_nonzero={seed}, " + f"language_id={language_id}" ) # Call the core model's generate method - wav_tensor = chatterbox_model.generate( - text=text, - audio_prompt_path=audio_prompt_path, - temperature=temperature, - exaggeration=exaggeration, - cfg_weight=cfg_weight, - ) + # For multilingual model, include language_id parameter if available + if use_multilingual_model and MULTILINGUAL_AVAILABLE and isinstance(chatterbox_model, ChatterboxMultilingualTTS): + # Use provided language_id or default from config + effective_language = language_id or config_manager.get_string("generation_defaults.language", "en") + logger.info(f"Generating speech for language: {effective_language}") + wav_tensor = chatterbox_model.generate( + text=text, + audio_prompt_path=audio_prompt_path, + temperature=temperature, + exaggeration=exaggeration, + cfg_weight=cfg_weight, + language_id=effective_language, + ) + else: + # English-only model doesn't use language_id parameter + if language_id and language_id != "en": + logger.warning( + f"Language '{language_id}' requested but multilingual model not available. " + "Generating in English. Upgrade chatterbox-tts for multilingual support." + ) + wav_tensor = chatterbox_model.generate( + text=text, + audio_prompt_path=audio_prompt_path, + temperature=temperature, + exaggeration=exaggeration, + cfg_weight=cfg_weight, + ) - # The ChatterboxTTS.generate method already returns a CPU tensor. + # The model's generate method already returns a CPU tensor. 
return wav_tensor, chatterbox_model.sr except Exception as e: diff --git a/main.py b/main.py new file mode 100644 index 0000000..21ed8de --- /dev/null +++ b/main.py @@ -0,0 +1,6 @@ +def main(): + print("Hello from chatterbox-tts-server!") + + +if __name__ == "__main__": + main() diff --git a/models.py b/models.py index 0bef8d6..8e06888 100644 --- a/models.py +++ b/models.py @@ -16,9 +16,9 @@ class GenerationParams(BaseModel): ) exaggeration: Optional[float] = Field( None, - ge=0.25, # Based on Chatterbox Gradio app - le=2.0, # Based on Chatterbox Gradio app - description="Controls expressiveness/exaggeration. (Range: 0.25-2.0)", + ge=0.5, # Based on Chatterbox tutorial notebook + le=3.0, # Based on Chatterbox tutorial notebook + description="Controls expressiveness/exaggeration. Lower values are more subdued/calm, higher values are more dramatic/energetic. (Range: 0.5-3.0)", ) cfg_weight: Optional[float] = Field( None, diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7b92d81 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,7 @@ +[project] +name = "chatterbox-tts-server" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.10" +dependencies = [] diff --git a/requirements.txt b/requirements.txt index 700d35c..16342e2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ fastapi # Modern async web framework uvicorn[standard] # ASGI server with performance extras # --- Scientific Computing & ML Libraries --- -numpy==1.26.4 # Fundamental numerical computing +numpy # Fundamental numerical computing librosa # Advanced audio/music analysis safetensors # Safe tensor serialization format descript-audio-codec # Audio codec for ML applications diff --git a/run.bash b/run.bash new file mode 100755 index 0000000..9e5f3e0 --- /dev/null +++ b/run.bash @@ -0,0 +1 @@ +source .venv/bin/activate && python server.py diff --git a/server.py b/server.py index dbe3a04..69e6ae2 100644 --- 
a/server.py +++ b/server.py @@ -77,6 +77,7 @@ class OpenAISpeechRequest(BaseModel): response_format: Literal["wav", "opus", "mp3"] = "wav" # Add "mp3" speed: float = 1.0 seed: Optional[int] = None + language: Optional[str] = None # Added language support for multilingual # --- Logging Configuration --- @@ -758,6 +759,11 @@ async def custom_tts_endpoint( seed=( request.seed if request.seed is not None else get_gen_default_seed() ), + language_id=( + request.language + if request.language is not None + else get_gen_default_language() + ), ) perf_monitor.record(f"Engine synthesized chunk {i+1}") @@ -936,6 +942,11 @@ async def openai_speech_endpoint(request: OpenAISpeechRequest): exaggeration=get_gen_default_exaggeration(), cfg_weight=get_gen_default_cfg_weight(), seed=seed_to_use, + language_id=( + request.language + if request.language is not None + else get_gen_default_language() + ), ) if audio_tensor is None or sr is None: diff --git a/ui/index.html b/ui/index.html index 07ecb87..615336c 100644 --- a/ui/index.html +++ b/ui/index.html @@ -1,9 +1,10 @@ - + + - - - + + + Chatterbox TTS Server - + - + - +
- - -
- - - -
- + +
+
+
-
- -
- - -
-
+
+ +
+

+ Generate Speech +

-
- -
- - - - -
-
+
+ +

+ Enter the text you want to convert to speech. For + audiobooks, you can paste long chapters. +

+
+ +
+ 0 Characters +
+
+
- +
+ + + +
+ -
- -
-

- Loading presets...

-
-
+
+ +
+ + +
+
-
-
- - Generation - Parameters - - - - - - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- -
- -
-

- Integer for reproducible results. Some engines use 0 or -1 for random. -

-
-
- - -
-
- - -

- MP3 is recommended for smaller file sizes (e.g., audiobooks). -

-
-
- - -
-
-
-
+
+ +
+ + + + +
+
-
-
- - Server - Configuration - - - - - - -
-

- These settings are loaded from config.yaml - via an API call. - Restart the server to apply changes to - Host, Port, Model, or Path settings if modified here or directly in the - file. -

-
-
-
-
-
-
-
-
-
-
-
-
-
+ -
- - - -
-
-
-
-
-
+
+ +
+

+ Loading presets... +

+
+
- -
+
- - + + + -