From 26db1ee5d31a68a9c2908f5058959b117b70b6cd Mon Sep 17 00:00:00 2001
From: Christopher Oezbek
Date: Tue, 7 Oct 2025 10:29:41 +0200
Subject: [PATCH 1/4] Update project dependencies to support torch 2.8.0

---
 pyproject.toml                              | 48 ++++++-------
 .../models/speech_tokenizer/kmeans.py       |  3 +-
 .../speech_tokenizer/speech_tokenizer.py    | 72 +++++++++++--------
 .../models/speech_tokenizer/xlsr_encoder.py | 13 ++--
 4 files changed, 73 insertions(+), 63 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f0d4442..2dbb78a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,37 +9,37 @@ allow-direct-references = true
 name = "playdiffusion"
 version = "0.1.0"
 description = "Diffusion model for speech inpainting and TTS"
-requires-python = "==3.11.*"
+requires-python = ">=3.11"
 license = { text = "Apache-2.0" }
 dependencies = [
-    "torch==2.6.0",
-    "torchaudio==2.6.0",
-    "numpy==1.24.3",
-    "fairseq2==0.4.4",
-    "nltk==3.9.1",
+    "torch>=2.8.0",
+    "torchaudio>=2.8.0",
+    "numpy>=1.24.3",
+    "fairseq2>=0.5.2",
+    "nltk>=3.9.1",
     "syllables @ git+https://github.com/playht/python-syllables.git",
-    "jiwer==3.1.0",
-    "pydantic==2.11.5",
-    "soundfile==0.13.1",
-    "boto3==1.38.22",
-    "tqdm==4.67.1",
-    "python-decouple==3.8",
-    "safetensors==0.5.3",
-    "tokenizers==0.21.1",
-    "librosa==0.10.1",
-    "scipy==1.11.4",
-    "scikit-learn==1.3.2",
-    "einops==0.8.1",
-    "torchtune==0.6.1",
-    "torchao==0.11.0",
-    "huggingface-hub==0.31.4",
-    "unidecode==1.4.0",
+    "jiwer>=3.1.0",
+    "pydantic>=2.11.5",
+    "soundfile>=0.13.1",
+    "boto3>=1.38.22",
+    "tqdm>=4.67.1",
+    "python-decouple>=3.8",
+    "safetensors>=0.6.0",
+    "tokenizers>=0.21.1",
+    "librosa>=0.10.1",
+    "scipy>=1.11.4",
+    "scikit-learn>=1.3.2",
+    "einops>=0.8.1",
+    "torchtune>=0.6.1",
+    "torchao>=0.11.0",
+    "huggingface-hub>=0.31.4",
+    "unidecode>=1.4.0",
 ]
 
 [project.optional-dependencies]
 demo = [
-    "gradio==5.31.0",
-    "openai==1.82.0",
+    "gradio>=5.31.0",
+    "openai>=1.82.0",
     "openai-whisper>=20230314",
     "whisper-timestamped>=0.0.11",
 ]
diff --git a/src/playdiffusion/models/speech_tokenizer/kmeans.py b/src/playdiffusion/models/speech_tokenizer/kmeans.py
index 8b134c9..06088e4 100644
--- a/src/playdiffusion/models/speech_tokenizer/kmeans.py
+++ b/src/playdiffusion/models/speech_tokenizer/kmeans.py
@@ -1,11 +1,10 @@
 import numpy as np
 import torch
-from fairseq2.typing import DataType, Device
 from torch import Tensor, nn
 
 
 class KmeansModel(nn.Module):
-    def __init__(self, km_path: str, device: Device, dtype: DataType):
+    def __init__(self, km_path: str, device: torch.device, dtype: torch.dtype):
         super().__init__()
         km_model = np.load(km_path)
         centroids_numpy = km_model.transpose()
diff --git a/src/playdiffusion/models/speech_tokenizer/speech_tokenizer.py b/src/playdiffusion/models/speech_tokenizer/speech_tokenizer.py
index c16f3e4..613b80a 100644
--- a/src/playdiffusion/models/speech_tokenizer/speech_tokenizer.py
+++ b/src/playdiffusion/models/speech_tokenizer/speech_tokenizer.py
@@ -2,10 +2,8 @@
 from typing import List, Optional, Tuple, Union
 
 import torch
-from fairseq2.data import Collater
-from fairseq2.models.sequence import SequenceBatch
-from fairseq2.nn.padding import PaddingMask, get_seqs_and_padding_mask
-from fairseq2.typing import DataType, Device
+from fairseq2.nn import BatchLayout
+from torch.nn.utils.rnn import pad_sequence
 
 from playdiffusion.models.speech_tokenizer.kmeans import KmeansModel
 from playdiffusion.models.speech_tokenizer.xlsr_encoder import load_xlsr_encoder
@@ -32,8 +30,8 @@ def __init__(
         self,
         checkpoint: Union[str, None] = "data/checkpoints/xlsr2_1b_v2_custom.pt",
         max_layer: Union[int, None] = 35,
-        device: Optional[Device] = None,
-        dtype: DataType = torch.float32,
+        device: Optional[torch.device] = None,
+        dtype: torch.dtype = torch.float32,
         strict: bool = False,
         eval: bool = True,
     ) -> None:
@@ -81,17 +79,20 @@ def dtype(self):
         return next(self.parameters()).dtype
 
     @torch.inference_mode()
-    def forward(self, batch: SequenceBatch) -> Tuple[torch.Tensor, PaddingMask]:
+    # The forward signature now accepts the padded sequences and the batch layout
+    def forward(self, seqs: torch.Tensor, layout: BatchLayout) -> torch.Tensor:
         """
         Minimal re-implementation that assumes we only loaded `max_layer` layers.
         This is better as it doesn't require the full model to be loaded.
 
-        :param batch:
-            The batch of sequences to process.
+        :param seqs:
+            The batch of padded sequences.
+        :param layout:
+            The layout of the batch (containing sequence lengths).
         """
-        seqs, padding_mask = self.model.encoder_frontend(batch.seqs, batch.padding_mask)
-        encoder_output, padding_mask = self.model.encoder(seqs, padding_mask)
-        return encoder_output, padding_mask
+        seqs, layout = self.model.encoder_frontend(seqs, layout)
+        encoder_output = self.model.encoder(seqs, layout)
+        return encoder_output
 
 
 class SpeechTokenizer(torch.nn.Module):
@@ -111,11 +112,10 @@ def __init__(
         self,
         checkpoint: Union[str, None] = "data/checkpoints/xlsr2_1b_v2_custom.pt",
         kmeans_layer_checkpoint: str = "data/checkpoints/kmeans_10k.npy",
-        dtype: DataType = torch.float16,
-        device: Optional[Device] = None,
+        dtype: torch.dtype = torch.float16,
+        device: Optional[torch.device] = None,
     ) -> None:
         super().__init__()
-        self.collater = Collater(pad_value=1, pad_to_multiple=2)
 
         if device is None:
             device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
@@ -134,25 +134,37 @@ def device(self):
     def dtype(self):
         return next(self.parameters()).dtype
 
-    def create_batch(self, x: BATCH_INPUT) -> SequenceBatch:
-        src = self.collater(x)
-        seqs, padding_mask = get_seqs_and_padding_mask(src)
-        batch = SequenceBatch(seqs=seqs, padding_mask=padding_mask)
-        return batch
+    def create_batch(self, x: BATCH_INPUT) -> Tuple[torch.Tensor, BatchLayout]:
+        if isinstance(x, torch.Tensor):
+            x = [x]
+
+        lens: List[int] = [int(t.shape[0]) for t in x]
+        # Original code padded with 1, but for an audio model 0 makes more sense
+        seqs = pad_sequence(x, batch_first=True, padding_value=0.0)
+        seqs = seqs.to(self.device, self.dtype)
+
+        B, T_max = int(seqs.size(0)), int(seqs.size(1))
+        layout = BatchLayout(shape=(B, T_max), seq_lens=lens, device=seqs.device)
+        return seqs, layout
 
     @torch.inference_mode()
-    def forward(self, batch: SequenceBatch) -> Tuple[torch.Tensor, PaddingMask]:
-        self.cuda_stream.wait_stream(torch.cuda.current_stream())
-        with torch.cuda.stream(self.cuda_stream):
-            z, padding_mask = self.encoder(batch)
+    def forward(self, seqs: torch.Tensor, layout: BatchLayout) -> torch.Tensor:
+
+        units = None
+        if torch.cuda.is_available():
+            self.cuda_stream.wait_stream(torch.cuda.current_stream())
+            with torch.cuda.stream(self.cuda_stream):
+                z = self.encoder(seqs, layout)
+                units = self.kmeans(z)
+                self.gpu_memory_manager.check_and_cleanup()
+            torch.cuda.current_stream().wait_stream(self.cuda_stream)
+        else:
+            z = self.encoder(seqs, layout)
             units = self.kmeans(z)
-            self.gpu_memory_manager.check_and_cleanup()
-        torch.cuda.current_stream().wait_stream(self.cuda_stream)
-        return units, padding_mask
+        return units
 
     @torch.inference_mode()
     def waveform_to_units(self, waveform: torch.Tensor) -> torch.Tensor:
-        waveform = waveform.to(self.device).to(self.dtype)
-        batch = self.create_batch(waveform)
-        units, _ = self(batch)
+        seqs, layout = self.create_batch(waveform)
+        units = self(seqs, layout)
         return units
\ No newline at end of file
diff --git a/src/playdiffusion/models/speech_tokenizer/xlsr_encoder.py b/src/playdiffusion/models/speech_tokenizer/xlsr_encoder.py
index 1cc81ea..54992b5 100644
--- a/src/playdiffusion/models/speech_tokenizer/xlsr_encoder.py
+++ b/src/playdiffusion/models/speech_tokenizer/xlsr_encoder.py
@@ -1,14 +1,13 @@
 from typing import Tuple, Union
 
-from fairseq2.models.wav2vec2._factory import (
+from fairseq2.models.wav2vec2 import (
     Wav2Vec2Factory,
     Wav2Vec2Config,
     Wav2Vec2EncoderConfig,
+    Wav2Vec2Model
 )
-from fairseq2.models.wav2vec2._model import Wav2Vec2Model
-from fairseq2.nn.transformer import TransformerNormOrder
-from fairseq2.typing import DataType, Device
-
+from fairseq2.models.transformer import TransformerNormOrder
+import torch
 
 def _encoder_xlsr2_1b_v2() -> Wav2Vec2EncoderConfig:
     """
@@ -28,7 +27,7 @@ def _encoder_xlsr2_1b_v2() -> Wav2Vec2EncoderConfig:
         feature_extractor_layer_descs=layer_descs,  # type: ignore
         feature_extractor_bias=True,
         feature_extractor_layer_norm_convs=True,
-        feature_gradient_scale=1.0,
+        feature_grad_scale=1.0,
         num_fbank_channels=0,
         fbank_stride=0,
         sample_fbank_every_k=0,
@@ -74,7 +73,7 @@ def _xlsr2_1b_v2() -> Wav2Vec2Config:
 
 
 def load_xlsr_encoder(
-    device: Device, dtype: DataType, max_layer: Union[int, None] = 35
+    device: torch.device, dtype: torch.dtype, max_layer: Union[int, None] = 35
 ) -> Tuple[Wav2Vec2Model, Wav2Vec2Config, Wav2Vec2EncoderConfig]:
     """
     build_xlsr_1b_v2
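Note (usage sketch, not part of the patches): PATCH 1 replaces fairseq2 0.4's Collater/SequenceBatch/PaddingMask with torch's pad_sequence plus fairseq2 0.5's BatchLayout. A minimal sketch of that idiom, with made-up waveform lengths:

    import torch
    from torch.nn.utils.rnn import pad_sequence
    from fairseq2.nn import BatchLayout

    # Two hypothetical mono waveforms of unequal length.
    waveforms = [torch.randn(16000), torch.randn(24000)]
    seq_lens = [int(t.shape[0]) for t in waveforms]

    # Right-pad with 0.0 (silence) to the longest sequence -> shape [2, 24000].
    seqs = pad_sequence(waveforms, batch_first=True, padding_value=0.0)

    # BatchLayout records the true lengths that PaddingMask used to carry.
    layout = BatchLayout(shape=(seqs.size(0), seqs.size(1)), seq_lens=seq_lens, device=seqs.device)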
"data/checkpoints/xlsr2_1b_v2_custom.pt", max_layer: Union[int, None] = 35, - device: Optional[Device] = None, - dtype: DataType = torch.float32, + device: Optional[torch.device] = None, + dtype: torch.dtype = torch.float32, strict: bool = False, eval: bool = True, ) -> None: @@ -81,17 +79,20 @@ def dtype(self): return next(self.parameters()).dtype @torch.inference_mode() - def forward(self, batch: SequenceBatch) -> Tuple[torch.Tensor, PaddingMask]: + # The forward signature now accepts the padded sequences and the batch layout + def forward(self, seqs: torch.Tensor, layout: BatchLayout) -> torch.Tensor: """ Minimal re-implementation that assumes we only loaded `max_layer` layers. This is better as it doesn't require the full model to be loaded. - :param batch: - The batch of sequences to process. + :param seqs: + The batch of padded sequences. + :param layout: + The layout of the batch (containing sequence lengths). """ - seqs, padding_mask = self.model.encoder_frontend(batch.seqs, batch.padding_mask) - encoder_output, padding_mask = self.model.encoder(seqs, padding_mask) - return encoder_output, padding_mask + seqs, layout = self.model.encoder_frontend(seqs, layout) + encoder_output = self.model.encoder(seqs, layout) + return encoder_output class SpeechTokenizer(torch.nn.Module): @@ -111,11 +112,10 @@ def __init__( self, checkpoint: Union[str, None] = "data/checkpoints/xlsr2_1b_v2_custom.pt", kmeans_layer_checkpoint: str = "data/checkpoints/kmeans_10k.npy", - dtype: DataType = torch.float16, - device: Optional[Device] = None, + dtype: torch.dtype = torch.float16, + device: Optional[torch.device] = None, ) -> None: super().__init__() - self.collater = Collater(pad_value=1, pad_to_multiple=2) if device is None: device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") @@ -134,25 +134,37 @@ def device(self): def dtype(self): return next(self.parameters()).dtype - def create_batch(self, x: BATCH_INPUT) -> SequenceBatch: - src = self.collater(x) - seqs, padding_mask = get_seqs_and_padding_mask(src) - batch = SequenceBatch(seqs=seqs, padding_mask=padding_mask) - return batch + def create_batch(self, x: BATCH_INPUT) -> Tuple[torch.Tensor, BatchLayout]: + if isinstance(x, torch.Tensor): + x = [x] + + lens: List[int] = [int(t.shape[0]) for t in x] + # Original code padded with 1, but for an audio model 0 makes more sense + seqs = pad_sequence(x, batch_first=True, padding_value=0.0) + seqs = seqs.to(self.device, self.dtype) + + B, T_max = int(seqs.size(0)), int(seqs.size(1)) + layout = BatchLayout(shape=(B, T_max), seq_lens=lens, device=seqs.device) + return seqs, layout @torch.inference_mode() - def forward(self, batch: SequenceBatch) -> Tuple[torch.Tensor, PaddingMask]: - self.cuda_stream.wait_stream(torch.cuda.current_stream()) - with torch.cuda.stream(self.cuda_stream): - z, padding_mask = self.encoder(batch) + def forward(self, seqs: torch.Tensor, layout: BatchLayout) -> torch.Tensor: + + units = None + if torch.cuda.is_available(): + self.cuda_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(self.cuda_stream): + z = self.encoder(seqs, layout) + units = self.kmeans(z) + self.gpu_memory_manager.check_and_cleanup() + torch.cuda.current_stream().wait_stream(self.cuda_stream) + else: + z = self.encoder(seqs, layout) units = self.kmeans(z) - self.gpu_memory_manager.check_and_cleanup() - torch.cuda.current_stream().wait_stream(self.cuda_stream) - return units, padding_mask + return units @torch.inference_mode() def waveform_to_units(self, 
From 42393afdb64ee2eaa158fc3b9193f7c7ec3c5b29 Mon Sep 17 00:00:00 2001
From: Christopher Oezbek
Date: Tue, 14 Oct 2025 14:33:17 +0200
Subject: [PATCH 3/4] Expose BatchLayout to callers to get output lens

---
 src/playdiffusion/inference.py              |  4 +-
 .../speech_tokenizer/speech_tokenizer.py    | 37 ++++++++++++-------
 2 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/src/playdiffusion/inference.py b/src/playdiffusion/inference.py
index adfc1b4..0a7287c 100644
--- a/src/playdiffusion/inference.py
+++ b/src/playdiffusion/inference.py
@@ -698,7 +698,7 @@ def inpaint(self, input: InpaintInput):
         print(f"Resampled wav: {resampled_wav.shape}")
         self.timer("Resample")
         with torch.inference_mode():
-            input_audio_tokens = self.mm.speech_tokenizer.waveform_to_units(
+            input_audio_tokens, _ = self.mm.speech_tokenizer.waveform_to_units(
                 resampled_wav.squeeze()
             )
         print(f"Input audio tokens: {input_audio_tokens.shape}")
@@ -844,7 +844,7 @@ def rvc(self, input: RVCInput):
         print(f"Resampled wav: {resampled_wav.shape}")
         self.timer("Resample")
         with torch.inference_mode():
-            input_audio_tokens = self.mm.speech_tokenizer.waveform_to_units(
+            input_audio_tokens, _ = self.mm.speech_tokenizer.waveform_to_units(
                 resampled_wav.squeeze()
            )
         print(f"Input audio tokens: {input_audio_tokens.shape}")
diff --git a/src/playdiffusion/models/speech_tokenizer/speech_tokenizer.py b/src/playdiffusion/models/speech_tokenizer/speech_tokenizer.py
index 613b80a..b92c321 100644
--- a/src/playdiffusion/models/speech_tokenizer/speech_tokenizer.py
+++ b/src/playdiffusion/models/speech_tokenizer/speech_tokenizer.py
@@ -90,9 +90,9 @@ def forward(self, seqs: torch.Tensor, layout: BatchLayout) -> torch.Tensor:
         :param layout:
             The layout of the batch (containing sequence lengths).
         """
-        seqs, layout = self.model.encoder_frontend(seqs, layout)
-        encoder_output = self.model.encoder(seqs, layout)
-        return encoder_output
+        seqs, layout_out = self.model.encoder_frontend(seqs, layout)
+        encoder_output = self.model.encoder(seqs, layout_out)
+        return encoder_output, layout_out
 
 
 class SpeechTokenizer(torch.nn.Module):
@@ -144,27 +144,36 @@ def create_batch(self, x: BATCH_INPUT) -> Tuple[torch.Tensor, BatchLayout]:
         seqs = seqs.to(self.device, self.dtype)
 
         B, T_max = int(seqs.size(0)), int(seqs.size(1))
-        layout = BatchLayout(shape=(B, T_max), seq_lens=lens, device=seqs.device)
-        return seqs, layout
+        seqs_layout = BatchLayout(shape=(B, T_max), seq_lens=lens, device=seqs.device)
+        return seqs, seqs_layout
 
     @torch.inference_mode()
-    def forward(self, seqs: torch.Tensor, layout: BatchLayout) -> torch.Tensor:
+    def forward(self, seqs: torch.Tensor, seqs_layout: BatchLayout) -> Tuple[torch.Tensor, BatchLayout]:
 
         units = None
         if torch.cuda.is_available():
             self.cuda_stream.wait_stream(torch.cuda.current_stream())
             with torch.cuda.stream(self.cuda_stream):
-                z = self.encoder(seqs, layout)
+                z, unit_layout = self.encoder(seqs, seqs_layout)
                 units = self.kmeans(z)
                 self.gpu_memory_manager.check_and_cleanup()
             torch.cuda.current_stream().wait_stream(self.cuda_stream)
         else:
-            z = self.encoder(seqs, layout)
-            units = self.kmeans(z)
-        return units
+            z, unit_layout = self.encoder(seqs, seqs_layout)
+            units = self.kmeans(z)  # Doesn't modify layout
+        return units, unit_layout
 
     @torch.inference_mode()
-    def waveform_to_units(self, waveform: torch.Tensor) -> torch.Tensor:
-        seqs, layout = self.create_batch(waveform)
-        units = self(seqs, layout)
-        return units
\ No newline at end of file
+    def waveform_to_units(self, waveform: Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor, ...]]) -> Tuple[torch.Tensor, BatchLayout]:
+        """
+        Converts a single waveform tensor or a list of waveform tensors into audio tokens.
+
+        Returns a batch of audio tokens [B, T] and the corresponding BatchLayout.
+        Use unit_layout.seq_lens to get the lengths of the individual audio token tensors.
+
+        Output units are tokens of dtype torch.int64,
+        with 0 <= token < num_embeddings (e.g., 10000 for kmeans_10k.npy).
+        """
+        seqs, seqs_layout = self.create_batch(waveform)
+        units, unit_layout = self(seqs, seqs_layout)
+        return units, unit_layout
\ No newline at end of file
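Note (usage sketch, not part of the patches): PATCH 3 makes the tokenizer return the unit tensor together with its BatchLayout, so callers can recover per-utterance token lengths. A minimal sketch of the resulting API, assuming the default checkpoint paths from the diff and made-up input waveforms:

    import torch
    from playdiffusion.models.speech_tokenizer.speech_tokenizer import SpeechTokenizer

    tokenizer = SpeechTokenizer()  # default checkpoint paths from the diff

    # Two hypothetical waveforms of unequal length, batched together.
    waveforms = [torch.randn(16000), torch.randn(24000)]
    units, unit_layout = tokenizer.waveform_to_units(waveforms)

    # units is [B, T] with dtype torch.int64; unit_layout.seq_lens gives the
    # true (unpadded) token count for each utterance.
    per_utterance = [units[i, :n] for i, n in enumerate(unit_layout.seq_lens)]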
From 667ed42de7aeba23932c19351eef544df44863cf Mon Sep 17 00:00:00 2001
From: Christopher Oezbek
Date: Tue, 14 Oct 2025 14:33:38 +0200
Subject: [PATCH 4/4] Add torchcodec, needed for the gradio demo

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 2dbb78a..76757dc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,6 +42,7 @@ demo = [
     "openai>=1.82.0",
     "openai-whisper>=20230314",
     "whisper-timestamped>=0.0.11",
+    "torchcodec>=0.7.0",
 ]
 
 [tool.hatch.build.targets.wheel]
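Note (not part of the patches): torchcodec lands in the optional "demo" extra defined in pyproject.toml, so a source checkout pulls it in together with the other demo dependencies via pip's extras syntax:

    pip install -e ".[demo]"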