From e18d985a2e0516cf83f9b8d25efda65e95b1126a Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Tue, 11 Nov 2025 12:18:06 +0530 Subject: [PATCH 1/2] Add multilingual TTS support with ChatterboxMultilingualTTS - Upgrade to ChatterboxMultilingualTTS from GitHub (supports 23 languages) - Add MPS device compatibility fixes for Apple Silicon Macs - Fix attention implementation issues with SDPA on MPS devices - Update config to enable multilingual support with English as default language - Enhance UI with model selection and language controls - Add graceful fallback for missing multilingual support - Update server API to handle language_id parameter - Fix torch.load map_location for non-CUDA devices - Add comprehensive documentation for multilingual setup Now working with multilingual model on Mac, supporting 23 languages including Hindi, Arabic, Chinese, French, German, etc. Default language set to English while maintaining full multilingual capabilities. --- .python-version | 1 + UI_CHANGES_SUMMARY.md | 151 +++ UPGRADE_FOR_HINDI.md | 224 ++++ config.py | 5 +- config.yaml | 52 +- engine.py | 127 ++- main.py | 6 + models.py | 6 +- pyproject.toml | 7 + requirements.txt | 2 +- run.bash | 1 + server.py | 11 + ui/index.html | 1436 ++++++++++++++++-------- ui/script.js | 2458 +++++++++++++++++++++++++---------------- uv.lock | 8 + 15 files changed, 3050 insertions(+), 1445 deletions(-) create mode 100644 .python-version create mode 100644 UI_CHANGES_SUMMARY.md create mode 100644 UPGRADE_FOR_HINDI.md create mode 100644 main.py create mode 100644 pyproject.toml create mode 100755 run.bash create mode 100644 uv.lock diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..c8cfe39 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10 diff --git a/UI_CHANGES_SUMMARY.md b/UI_CHANGES_SUMMARY.md new file mode 100644 index 0000000..531e7b9 --- /dev/null +++ b/UI_CHANGES_SUMMARY.md @@ -0,0 +1,151 @@ +# UI Changes for Multilingual Support + +## Summary +Added 
UI controls to allow users to select between the multilingual and English-only TTS models, along with improved language selection and status indicators. + +## Changes Made + +### 1. Navigation Bar (index.html) +**Added Model Status Badge** - Shows which model is currently loaded +- Location: Next to the title in the navigation bar +- Displays: "🌐 Multilingual (23 Languages)" or "🇬🇧 English Only" +- Updates automatically based on server configuration +- Provides visual feedback about the active model + +### 2. Generation Parameters Section (index.html) +**Added Model Type Selector** - Dropdown to choose TTS model +- Location: Just before the Language selector in the Generation Parameters +- Options: + - "Multilingual (23 Languages)" - Supports 23 languages including Hindi + - "English Only" - Smaller model, English only +- Note: Warns users that server restart is required to apply changes + +**Updated Language Selector** +- Changed default selection from "en" (English) to "hi" (Hindi) +- Added language codes in parentheses for clarity (e.g., "Hindi (hi)") +- Updated help text to be more informative +- Now properly syncs with the config.yaml default language setting + +### 3. 
JavaScript Updates (script.js) + +#### Model Type Handling +- Added `modelTypeSelect` element reference +- Loads current model type from server config on page load +- Sets dropdown value based on `config.model.use_multilingual` + +#### Save Generation Parameters +- Extended to save both generation parameters AND model type selection +- Saves `model.use_multilingual` boolean value to server config +- Shows warning notification when model type changes +- Detects if restart is needed and notifies user + +#### Model Status Badge Updates +- Badge text and icon update on page load based on config +- Shows multilingual status with globe emoji (🌐) or English with flag (🇬🇧) +- Tooltip provides detailed information about loaded model + +#### Change Detection & Warnings +- Detects when user changes model type from current config +- Shows persistent notification reminding user to: + 1. Click "Save Generation Parameters" + 2. Restart the server for changes to take effect +- Prevents confusion about why model didn't change + +## User Workflow + +### To Switch Models: +1. Open the UI in browser +2. Look at the Generation Parameters section +3. Find "TTS Model" dropdown +4. Select desired model: + - "Multilingual (23 Languages)" - for Hindi and other languages + - "English Only" - for English-only use case +5. Click "Save Generation Parameters" button +6. Notice the warning about server restart +7. Click "Restart Server" button (or manually restart) +8. Refresh the page +9. Verify the model status badge shows the new model + +### To Use Hindi (Default): +1. Model is already set to Multilingual +2. Language is already set to Hindi (hi) +3. Simply enter Hindi text and generate + +### To Use Other Languages: +1. Ensure Model Type is "Multilingual" +2. Select desired language from Language dropdown +3. Enter text in that language +4. 
Generate speech + +## Visual Indicators + +### Model Status Badge (Top Navigation) +``` +Current State Badge Display +------------------ ------------------------------- +Multilingual loaded -> "🌐 Multilingual (23 Languages)" +English-only loaded -> "🇬🇧 English Only" +``` + +### Model Type Selector (Form) +``` +Config Value Dropdown Shows +-------------------------------- --------------------------- +use_multilingual: true -> "Multilingual (23 Languages)" +use_multilingual: false -> "English Only" +``` + +### Language Selector (Form) +- Shows all 23 supported languages +- Hindi (hi) is selected by default +- Language codes shown for clarity + +## Benefits + +1. **User-Friendly**: Clear visual feedback about which model is loaded +2. **Flexible**: Easy switching between multilingual and English-only models +3. **Informative**: Warnings and tooltips guide users through the process +4. **Consistent**: UI state syncs with server configuration +5. **Safe**: Clear warnings about server restart requirements + +## Technical Details + +### Config Values Saved +```json +{ + "model": { + "use_multilingual": true // or false + }, + "generation_defaults": { + "language": "hi", // or any of 23 supported codes + "temperature": 0.8, + "exaggeration": 0.5, + // ... other params + } +} +``` + +### Supported Language Codes +ar, da, de, el, en, es, fi, fr, he, **hi**, it, ja, ko, ms, nl, no, pl, pt, ru, sv, sw, tr, zh + +### Files Modified +1. `ui/index.html` - Added model selector, updated language selector, added status badge +2. `ui/script.js` - Added model handling logic, status updates, change detection +3. `config.yaml` - Set default to multilingual model and Hindi language +4. `config.py` - Added default multilingual setting to DEFAULT_CONFIG +5. `engine.py` - Added multilingual model support +6. 
`server.py` - Added language_id parameter handling + +## Testing Checklist + +- [ ] Model status badge shows correct model on page load +- [ ] Model type selector reflects current config +- [ ] Language selector defaults to Hindi +- [ ] Changing model type shows notification +- [ ] Save button updates config.yaml correctly +- [ ] Restart button triggers server restart +- [ ] After restart, new model is loaded +- [ ] Badge updates after model change +- [ ] Hindi text generates proper speech (not noise) +- [ ] Other languages work correctly +- [ ] English still works when English-only model selected diff --git a/UPGRADE_FOR_HINDI.md b/UPGRADE_FOR_HINDI.md new file mode 100644 index 0000000..b40200b --- /dev/null +++ b/UPGRADE_FOR_HINDI.md @@ -0,0 +1,224 @@ +# Upgrade Guide: Enable Hindi & Multilingual Support + +## Current Status + +Your Chatterbox TTS server is currently running with: +- **chatterbox-tts version**: 0.1.2 (English-only) +- **Model**: English-only TTS model +- **Languages**: English only + +## To Enable Hindi & 22 Other Languages + +The code has been updated to support multilingual TTS, but you need to upgrade the `chatterbox-tts` package to access the multilingual model. 
+ +### Step 1: Upgrade chatterbox-tts Package + +```bash +# Stop the server first (Ctrl+C if running) + +# Activate your virtual environment +source .venv/bin/activate + +# Upgrade to the latest version +pip install --upgrade chatterbox-tts + +# Or install a specific version if available +pip install "chatterbox-tts>=0.1.4" +``` + +### Step 2: Verify the Upgrade + +```bash +# Check the installed version +pip show chatterbox-tts + +# Verify multilingual support is available +python -c "from chatterbox.mtl_tts import ChatterboxMultilingualTTS; print('Multilingual support: Available')" || echo "Multilingual support: Not yet available" +``` + +### Step 3: Enable Multilingual in Configuration + +Edit `config.yaml`: + +```yaml +model: + repo_id: ResembleAI/chatterbox + use_multilingual: true # Change from false to true + +generation_defaults: + temperature: 0.8 + exaggeration: 0.5 + cfg_weight: 0.5 + seed: 0 + speed_factor: 1.0 + language: hi # Change from 'en' to 'hi' for Hindi +``` + +### Step 4: Restart the Server + +```bash +./run.bash +``` + +Or manually: + +```bash +source .venv/bin/activate +python server.py +``` + +### Step 5: Verify Hindi Support + +Check the server logs for: +``` +Successfully loaded Multilingual TTS model on mps. Supports 23 languages including Hindi. +``` + +Open the web UI and you should see: +- Model badge showing: "🌐 Multilingual (23 Languages)" +- Language selector with Hindi (hi) as default + +## Current Fallback Behavior + +The code has been designed to gracefully handle the missing multilingual model: + +1. **If multilingual is requested but not available**: + - Server logs a warning + - Automatically falls back to English-only model + - Server continues to run normally + +2. 
**If you try to generate Hindi audio**: + - Server logs a warning: "Language 'hi' requested but multilingual model not available" + - Generates English audio instead + - No errors or crashes + +## Supported Languages (After Upgrade) + +Once upgraded, your server will support these 23 languages: + +| Code | Language | Code | Language | Code | Language | +|------|----------|------|----------|------|----------| +| ar | Arabic | da | Danish | de | German | +| el | Greek | en | English | es | Spanish | +| fi | Finnish | fr | French | he | Hebrew | +| **hi** | **Hindi** | it | Italian | ja | Japanese | +| ko | Korean | ms | Malay | nl | Dutch | +| no | Norwegian | pl | Polish | pt | Portuguese | +| ru | Russian | sv | Swedish | sw | Swahili | +| tr | Turkish | zh | Chinese | | | + +## Troubleshooting + +### Issue: Multilingual import still fails after upgrade + +**Solution**: The multilingual model might be in a different package or version: + +```bash +# List the chatterbox-tts versions published on PyPI (`pip search` is no longer supported by PyPI) +pip index versions chatterbox-tts + +# Or check the Chatterbox GitHub for latest installation instructions +# https://github.com/resemble-ai/chatterbox +``` + +### Issue: Model download fails + +**Solution**: The multilingual model is larger (~2-3GB). Ensure you have: +- Sufficient disk space +- Stable internet connection +- Access to Hugging Face (not blocked by firewall) + +### Issue: Server shows "Multilingual model requested but not available" + +This means the package doesn't have the multilingual class yet. Check: + +```bash +# Verify the package contents +python -c "import chatterbox; print(dir(chatterbox))" + +# Look for ChatterboxMultilingualTTS in the output +``` + +### Issue: Still generating English audio for Hindi text + +Possible causes: +1. Multilingual model not loaded (check logs) +2. Config still has `use_multilingual: false` +3. Model badge still shows "English Only" + +**Solution**: Follow all upgrade steps again and restart server. 
+ +## Alternative: Use Latest GitHub Version + +If PyPI doesn't have the latest multilingual version yet: + +```bash +# Install directly from GitHub +pip uninstall chatterbox-tts -y +pip install git+https://github.com/resemble-ai/chatterbox.git + +# Or clone and install locally +git clone https://github.com/resemble-ai/chatterbox.git +cd chatterbox +pip install -e . +``` + +## Rollback to English-Only + +If you prefer to use only English: + +1. Edit `config.yaml`: + ```yaml + model: + use_multilingual: false + + generation_defaults: + language: en + ``` + +2. Restart server + +The server will use the smaller, faster English-only model. + +## Code Changes Summary + +The following changes were made to support gradual migration: + +### engine.py +- Added conditional import for multilingual model +- Falls back gracefully if multilingual not available +- Warns user to upgrade when multilingual is requested +- Uses English-only model as fallback + +### config.yaml +- Added `model.use_multilingual` setting (currently `false`) +- Language default set to `en` (will be `hi` after upgrade) + +### UI +- Model selector dropdown to switch between models +- Status badge showing currently loaded model +- Language selector for all 23 languages +- Automatic warnings when changes require restart + +## Benefits After Upgrade + +✅ **Full Hindi Support** - Generate natural-sounding Hindi speech +✅ **22 Additional Languages** - Support for multiple languages +✅ **Better Quality** - Improved voice quality for non-English languages +✅ **No Accent Issues** - Native language synthesis without English accent +✅ **UI Ready** - All UI controls already in place + +## Next Steps + +1. Upgrade `chatterbox-tts` package +2. Update `config.yaml` settings +3. Restart server +4. Test Hindi generation +5. Enjoy multilingual TTS! 
🎉 + +--- + +**Need Help?** +- Check server logs for detailed error messages +- Review [Chatterbox GitHub](https://github.com/resemble-ai/chatterbox) +- Check [Chatterbox Multilingual Demo](https://huggingface.co/spaces/ResembleAI/Chatterbox-Multilingual-TTS) diff --git a/config.py b/config.py index 2fdea6f..91f9ac8 100644 --- a/config.py +++ b/config.py @@ -50,6 +50,7 @@ }, "model": { # Added section for model source configuration "repo_id": "ResembleAI/chatterbox", # Default Hugging Face repository ID for the model + "use_multilingual": True, # Use multilingual model for 23 languages support (default: True) }, "tts_engine": { "device": "auto", # TTS processing device: 'auto', 'cuda', 'mps', or 'cpu'. @@ -72,11 +73,11 @@ }, "generation_defaults": { # Default parameters for TTS audio generation. "temperature": 0.8, # Controls randomness: lower is more deterministic. - "exaggeration": 0.5, # Controls expressiveness or exaggeration in speech. + "exaggeration": 0.5, # Controls expressiveness or exaggeration in speech. Range: 0.5 (subdued) to 3.0 (very dramatic). "cfg_weight": 0.5, # Classifier-Free Guidance weight, influences adherence to prompt/style. "seed": 0, # Random seed for generation. 0 often means random or engine default. "speed_factor": 1.0, # Controls the speed of the generated speech. - "language": "en", # Default language for TTS. + "language": "en", # Default language for TTS. Supported: ar (Arabic), da (Danish), de (German), el (Greek), en (English), es (Spanish), fi (Finnish), fr (French), he (Hebrew), hi (Hindi), it (Italian), ja (Japanese), ko (Korean), ms (Malay), nl (Dutch), no (Norwegian), pl (Polish), pt (Portuguese), ru (Russian), sv (Swedish), sw (Swahili), tr (Turkish), zh (Chinese). Note: Quality may vary by language. }, "audio_output": { # Settings related to the format of generated audio. "format": "wav", # Output audio format (e.g., 'wav', 'mp3'). 
diff --git a/config.yaml b/config.yaml index f610286..30f827f 100644 --- a/config.yaml +++ b/config.yaml @@ -1,6 +1,6 @@ server: host: 0.0.0.0 - port: 8004 + port: 8000 use_ngrok: false use_auth: false auth_username: user @@ -10,44 +10,60 @@ server: log_file_backup_count: 5 model: repo_id: ResembleAI/chatterbox + use_multilingual: true tts_engine: - device: cuda + device: mps predefined_voices_path: voices reference_audio_path: reference_audio - default_voice_id: Emily.wav + default_voice_id: default_sample.wav paths: model_cache: model_cache output: outputs generation_defaults: temperature: 0.8 - exaggeration: 1.3 + exaggeration: 0.5 cfg_weight: 0.5 seed: 0 speed_factor: 1.0 - language: en + language: hi audio_output: format: wav sample_rate: 24000 max_reference_duration_sec: 30 ui_state: - last_text: 'Are you tired of slow, unreliable connections? Upgrade today to Quantum - Fiber, the fastest internet in the galaxy! Experience seamless streaming, lag-free - gaming, and instant downloads. Call now and get your first three months half price! - Don''t wait, this offer won''t last forever! 
- - ' - last_voice_mode: predefined - last_predefined_voice: Emily.wav - last_reference_file: Gianna.wav - last_seed: 3000 - last_chunk_size: 240 + last_text: + "\u092E\u0948\u0902 \u0938\u093E\u0915\u094D\u0937\u0940 \u0906\u0928\ + \u0902\u0926 \u0939\u0942\u0901\u0964 \u092E\u0948\u0902 \u092C\u093F\u0939\u093E\ + \u0930 \u0915\u0947 \u092A\u091F\u0928\u093E \u092E\u0947\u0902 \u092A\u0948\u0926\ + \u093E \u0939\u0941\u0908 \u0925\u0940, \u0914\u0930 \u0905\u092C \u092A\u0941\ + \u0923\u0947 \u092E\u0947\u0902 \u0930\u0939 \u0930\u0939\u0940 \u0939\u0942\u0901\ + \u0964\n\n\u092E\u0948\u0902 \u092C\u0939\u0941\u0924 \u0906\u0932\u0938\u0940\ + \ \u0932\u0921\u093C\u0915\u0940 \u0939\u0942\u0901\u0964 \u092E\u0948\u0902 \u0938\ + \u0941\u092C\u0939 \u092C\u0939\u0941\u0924 \u0926\u0947\u0930 \u0938\u0947 \u0909\ + \u0920\u0924\u0940 \u0939\u0942\u0901, \u092E\u0941\u091D\u0947 \u0917\u092A\u0936\ + \u092A \u0915\u0930\u0928\u093E \u092C\u0939\u0941\u0924 \u092A\u0938\u0902\u0926\ + \ \u0939\u0948\u0964 \u092E\u0948\u0902 \u090F\u0915 \u092E\u0930\u094D\u0926\ + \ \u092C\u0928\u0928\u093E \u091A\u093E\u0939\u0924\u0940 \u0939\u0942\u0901\u0964\ + \ \u092E\u0941\u091D\u0947 \u0913\u0936\u094B \u0915\u0947 \u0935\u093F\u091A\u093E\ + \u0930 \u092C\u0939\u0941\u0924 \u092A\u0938\u0902\u0926 \u0939\u0948\u0902, \u091C\ + \u093F\u0928\u094D\u0939\u094B\u0902\u0928\u0947 \u092E\u0941\u091D\u0947 \u0938\ + \u093F\u0916\u093E\u092F\u093E \u0915\u093F \u092E\u0941\u0936\u094D\u0915\u093F\ + \u0932 \u0938\u0935\u093E\u0932 \u092A\u0942\u091B\u0928\u093E \u0915\u093F\u0924\ + \u0928\u093E \u091C\u093C\u0930\u0942\u0930\u0940 \u0939\u0948\u0964 \u0914\u0930\ + \ \u092E\u0947\u0930\u093E \u0926\u093F\u092E\u093E\u0917 \u0918\u0941\u091F\u0928\ + \u0947 \u092E\u0947\u0902 \u0939\u0948\u0964" + last_voice_mode: clone + last_predefined_voice: none + last_reference_file: recn.wav + last_seed: 2024 + last_chunk_size: 250 last_split_text_enabled: true hide_chunk_warning: false - 
hide_generation_warning: false + hide_generation_warning: true theme: light ui: title: Chatterbox TTS Server show_language_select: true - max_predefined_voices_in_dropdown: 50 + max_predefined_voices_in_dropdown: 20 debug: save_intermediate_audio: false diff --git a/engine.py b/engine.py index e42426e..b764a03 100644 --- a/engine.py +++ b/engine.py @@ -5,10 +5,20 @@ import random import numpy as np import torch -from typing import Optional, Tuple +from typing import Optional, Tuple, Union from pathlib import Path from chatterbox.tts import ChatterboxTTS # Main TTS engine class + +# Try to import multilingual model if available (newer versions) +try: + from chatterbox.mtl_tts import ChatterboxMultilingualTTS + MULTILINGUAL_AVAILABLE = True +except ImportError: + ChatterboxMultilingualTTS = None # type: ignore + MULTILINGUAL_AVAILABLE = False + logging.warning("Multilingual TTS model not available. Please upgrade chatterbox-tts for multilingual support.") + from chatterbox.models.s3gen.const import ( S3GEN_SR, ) # Default sample rate from the engine @@ -19,11 +29,12 @@ logger = logging.getLogger(__name__) # --- Global Module Variables --- -chatterbox_model: Optional[ChatterboxTTS] = None +chatterbox_model: Optional[Union[ChatterboxTTS, 'ChatterboxMultilingualTTS']] = None MODEL_LOADED: bool = False model_device: Optional[str] = ( - None # Stores the resolved device string ('cuda' or 'cpu') + None # Stores the resolved device string ('cuda', 'mps', or 'cpu') ) +use_multilingual_model: bool = True # Default to multilingual for broader language support def set_seed(seed_value: int): @@ -87,12 +98,13 @@ def load_model() -> bool: Loads the TTS model. This version directly attempts to load from the Hugging Face repository (or its cache) using `from_pretrained`, bypassing the local `paths.model_cache` directory. + Automatically uses the multilingual model for broader language support. Updates global variables `chatterbox_model`, `MODEL_LOADED`, and `model_device`. 
Returns: bool: True if the model was loaded successfully, False otherwise. """ - global chatterbox_model, MODEL_LOADED, model_device + global chatterbox_model, MODEL_LOADED, model_device, use_multilingual_model if MODEL_LOADED: logger.info("TTS model is already loaded.") @@ -157,27 +169,60 @@ def load_model() -> bool: model_device = resolved_device_str logger.info(f"Final device selection: {model_device}") - # Get configured model_repo_id for logging and context, - # though from_pretrained might use its own internal default if not overridden. - model_repo_id_config = config_manager.get_string( - "model.repo_id", "ResembleAI/chatterbox" - ) - + # Check if multilingual model should be used (default: True for broader language support) + use_multilingual_model = config_manager.get_bool("model.use_multilingual", True) + + # Check if multilingual model is actually available + if use_multilingual_model and not MULTILINGUAL_AVAILABLE: + logger.warning( + "Multilingual model requested but not available in current chatterbox-tts version. " + "Using English-only model. To enable multilingual support, upgrade chatterbox-tts: " + "pip install --upgrade chatterbox-tts" + ) + use_multilingual_model = False + logger.info( - f"Attempting to load model directly using from_pretrained (expected from Hugging Face repository: {model_repo_id_config} or library default)." + f"Attempting to load {'multilingual' if use_multilingual_model else 'English-only'} model using from_pretrained." ) try: # Directly use from_pretrained. This will utilize the standard Hugging Face cache. - # The ChatterboxTTS.from_pretrained method handles downloading if the model is not in the cache. - chatterbox_model = ChatterboxTTS.from_pretrained(device=model_device) - # The actual repo ID used by from_pretrained is often internal to the library, - # but logging the configured one provides user context. 
- logger.info( - f"Successfully loaded TTS model using from_pretrained on {model_device} (expected from '{model_repo_id_config}' or library default)." - ) + # The model's from_pretrained method handles downloading if the model is not in the cache. + if use_multilingual_model and MULTILINGUAL_AVAILABLE: + # Workaround for MPS/CPU: Patch torch.load to use map_location for non-CUDA devices + original_torch_load = torch.load + if model_device != "cuda": + device_obj = torch.device(model_device) + def patched_torch_load(f, *args, **kwargs): + if 'map_location' not in kwargs: + kwargs['map_location'] = device_obj + return original_torch_load(f, *args, **kwargs) + torch.load = patched_torch_load + + try: + chatterbox_model = ChatterboxMultilingualTTS.from_pretrained(device=model_device) + + # Fix for MPS: Set attention implementation to 'eager' to avoid SDPA issues + if hasattr(chatterbox_model, 't3') and hasattr(chatterbox_model.t3, 'tfmr'): + try: + chatterbox_model.t3.tfmr.config._attn_implementation = 'eager' + logger.info("Set attention implementation to 'eager' for MPS compatibility") + except Exception as e: + logger.warning(f"Could not set attention implementation: {e}") + + logger.info( + f"Successfully loaded Multilingual TTS model on {model_device}. Supports 23 languages including Hindi." + ) + finally: + # Restore original torch.load + torch.load = original_torch_load + else: + chatterbox_model = ChatterboxTTS.from_pretrained(device=model_device) + logger.info( + f"Successfully loaded English-only TTS model on {model_device}." 
+ ) except Exception as e_hf: logger.error( - f"Failed to load model using from_pretrained (expected from '{model_repo_id_config}' or library default): {e_hf}", + f"Failed to load {'multilingual' if use_multilingual_model else 'English-only'} model: {e_hf}", exc_info=True, ) chatterbox_model = None @@ -214,6 +259,7 @@ def synthesize( exaggeration: float = 0.5, cfg_weight: float = 0.5, seed: int = 0, + language_id: Optional[str] = None, ) -> Tuple[Optional[torch.Tensor], Optional[int]]: """ Synthesizes audio from text using the loaded TTS model. @@ -226,12 +272,14 @@ def synthesize( cfg_weight: Classifier-Free Guidance weight. seed: Random seed for generation. If 0, default randomness is used. If non-zero, a global seed is set for reproducibility. + language_id: Language code for multilingual model (e.g., 'hi' for Hindi, 'en' for English). + Only used with multilingual model. If None, defaults to config language. Returns: A tuple containing the audio waveform (torch.Tensor) and the sample rate (int), or (None, None) if synthesis fails. """ - global chatterbox_model + global chatterbox_model, use_multilingual_model if not MODEL_LOADED or chatterbox_model is None: logger.error("TTS model is not loaded. 
Cannot synthesize audio.") @@ -249,19 +297,40 @@ def synthesize( logger.debug( f"Synthesizing with params: audio_prompt='{audio_prompt_path}', temp={temperature}, " - f"exag={exaggeration}, cfg_weight={cfg_weight}, seed_applied_globally_if_nonzero={seed}" + f"exag={exaggeration}, cfg_weight={cfg_weight}, seed_applied_globally_if_nonzero={seed}, " + f"language_id={language_id}" ) # Call the core model's generate method - wav_tensor = chatterbox_model.generate( - text=text, - audio_prompt_path=audio_prompt_path, - temperature=temperature, - exaggeration=exaggeration, - cfg_weight=cfg_weight, - ) + # For multilingual model, include language_id parameter if available + if use_multilingual_model and MULTILINGUAL_AVAILABLE and isinstance(chatterbox_model, ChatterboxMultilingualTTS): + # Use provided language_id or default from config + effective_language = language_id or config_manager.get_string("generation_defaults.language", "en") + logger.info(f"Generating speech for language: {effective_language}") + wav_tensor = chatterbox_model.generate( + text=text, + audio_prompt_path=audio_prompt_path, + temperature=temperature, + exaggeration=exaggeration, + cfg_weight=cfg_weight, + language_id=effective_language, + ) + else: + # English-only model doesn't use language_id parameter + if language_id and language_id != "en": + logger.warning( + f"Language '{language_id}' requested but multilingual model not available. " + "Generating in English. Upgrade chatterbox-tts for multilingual support." + ) + wav_tensor = chatterbox_model.generate( + text=text, + audio_prompt_path=audio_prompt_path, + temperature=temperature, + exaggeration=exaggeration, + cfg_weight=cfg_weight, + ) - # The ChatterboxTTS.generate method already returns a CPU tensor. + # The model's generate method already returns a CPU tensor. 
return wav_tensor, chatterbox_model.sr except Exception as e: diff --git a/main.py b/main.py new file mode 100644 index 0000000..21ed8de --- /dev/null +++ b/main.py @@ -0,0 +1,6 @@ +def main(): + print("Hello from chatterbox-tts-server!") + + +if __name__ == "__main__": + main() diff --git a/models.py b/models.py index 0bef8d6..8e06888 100644 --- a/models.py +++ b/models.py @@ -16,9 +16,9 @@ class GenerationParams(BaseModel): ) exaggeration: Optional[float] = Field( None, - ge=0.25, # Based on Chatterbox Gradio app - le=2.0, # Based on Chatterbox Gradio app - description="Controls expressiveness/exaggeration. (Range: 0.25-2.0)", + ge=0.5, # Based on Chatterbox tutorial notebook + le=3.0, # Based on Chatterbox tutorial notebook + description="Controls expressiveness/exaggeration. Lower values are more subdued/calm, higher values are more dramatic/energetic. (Range: 0.5-3.0)", ) cfg_weight: Optional[float] = Field( None, diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7b92d81 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,7 @@ +[project] +name = "chatterbox-tts-server" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.10" +dependencies = [] diff --git a/requirements.txt b/requirements.txt index 700d35c..16342e2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ fastapi # Modern async web framework uvicorn[standard] # ASGI server with performance extras # --- Scientific Computing & ML Libraries --- -numpy==1.26.4 # Fundamental numerical computing +numpy # Fundamental numerical computing librosa # Advanced audio/music analysis safetensors # Safe tensor serialization format descript-audio-codec # Audio codec for ML applications diff --git a/run.bash b/run.bash new file mode 100755 index 0000000..9e5f3e0 --- /dev/null +++ b/run.bash @@ -0,0 +1 @@ +source .venv/bin/activate && python server.py diff --git a/server.py b/server.py index dbe3a04..69e6ae2 100644 --- 
a/server.py +++ b/server.py @@ -77,6 +77,7 @@ class OpenAISpeechRequest(BaseModel): response_format: Literal["wav", "opus", "mp3"] = "wav" # Add "mp3" speed: float = 1.0 seed: Optional[int] = None + language: Optional[str] = None # Added language support for multilingual # --- Logging Configuration --- @@ -758,6 +759,11 @@ async def custom_tts_endpoint( seed=( request.seed if request.seed is not None else get_gen_default_seed() ), + language_id=( + request.language + if request.language is not None + else get_gen_default_language() + ), ) perf_monitor.record(f"Engine synthesized chunk {i+1}") @@ -936,6 +942,11 @@ async def openai_speech_endpoint(request: OpenAISpeechRequest): exaggeration=get_gen_default_exaggeration(), cfg_weight=get_gen_default_cfg_weight(), seed=seed_to_use, + language_id=( + request.language + if request.language is not None + else get_gen_default_language() + ), ) if audio_tensor is None or sr is None: diff --git a/ui/index.html b/ui/index.html index 07ecb87..615336c 100644 --- a/ui/index.html +++ b/ui/index.html @@ -1,9 +1,10 @@ - + + - - - + + + Chatterbox TTS Server - + - + - +
- - -
- - - -
- + +
+
+
-
- -
- - -
-
+
+ +
+

+ Generate Speech +

-
- -
- - - - -
-
+
+ +

+ Enter the text you want to convert to speech. For + audiobooks, you can paste long chapters. +

+
+ +
+ 0 Characters +
+
+
- +
+ + + +
+ -
- -
-

- Loading presets...

-
-
+
+ +
+ + +
+
-
-
- - Generation - Parameters - - - - - - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- -
- -
-

- Integer for reproducible results. Some engines use 0 or -1 for random. -

-
-
- - -
-
- - -

- MP3 is recommended for smaller file sizes (e.g., audiobooks). -

-
-
- - -
-
-
-
+
+ +
+ + + + +
+
-
-
- - Server - Configuration - - - - - - -
-

- These settings are loaded from config.yaml - via an API call. - Restart the server to apply changes to - Host, Port, Model, or Path settings if modified here or directly in the - file. -

-
-
-
-
-
-
-
-
-
-
-
-
-
+ -
- - - -
-
-
-
-
-
+
+ +
+

+ Loading presets... +

+
+
- -
+
- - + + + -