From df967dcf1f1ef464e2921d49c810affa5eff4c0a Mon Sep 17 00:00:00 2001
From: shaxx
Date: Mon, 29 Sep 2025 17:29:43 +0530
Subject: [PATCH 1/2] fix: add missing import os to vae_hf_cs.py

---
 src/cerebras/modelzoo/data_preparation/vision/dit/vae_hf_cs.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/cerebras/modelzoo/data_preparation/vision/dit/vae_hf_cs.py b/src/cerebras/modelzoo/data_preparation/vision/dit/vae_hf_cs.py
index ddf366ee..cb249ca5 100644
--- a/src/cerebras/modelzoo/data_preparation/vision/dit/vae_hf_cs.py
+++ b/src/cerebras/modelzoo/data_preparation/vision/dit/vae_hf_cs.py
@@ -14,6 +14,8 @@
 
 # isort: off
 import sys
+# ensure os is available before it's used below
+import os
 sys.path.append(os.path.join(os.path.dirname(__file__), "../../../"))
 # isort: on
 

From 3aa999371625514dd2f2359ea8d2657d05e9a78e Mon Sep 17 00:00:00 2001
From: shaxx
Date: Mon, 29 Sep 2025 18:41:34 +0530
Subject: [PATCH 2/2] fix: correct pad_id assignment order in FinetuningTokenGenerator

---
 .../finetuning_token_generator.py             | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/cerebras/modelzoo/data_preparation/data_preprocessing/finetuning_token_generator.py b/src/cerebras/modelzoo/data_preparation/data_preprocessing/finetuning_token_generator.py
index 4d66c287..7495397d 100644
--- a/src/cerebras/modelzoo/data_preparation/data_preprocessing/finetuning_token_generator.py
+++ b/src/cerebras/modelzoo/data_preparation/data_preprocessing/finetuning_token_generator.py
@@ -70,13 +70,15 @@ def __init__(self, params, tokenizer, eos_id, pad_id):
         )
         self.tokenizer.chat_template = default_chat_template()
 
-        self.eos_id = eos_id
-        self.eos_token = (
-            self.tokenizer.convert_ids_to_tokens(self.pad_id)
-            if self.eos_id is None
-            else self.tokenizer.convert_ids_to_tokens(self.eos_id)
-        )
-        self.pad_id = pad_id
+        # Assign pad_id first so it's available for use below
+        self.pad_id = pad_id
+        self.eos_id = eos_id
+        # Set EOS token, using pad_id as fallback if eos_id is None
+        self.eos_token = (
+            self.tokenizer.convert_ids_to_tokens(self.pad_id)
+            if self.eos_id is None
+            else self.tokenizer.convert_ids_to_tokens(self.eos_id)
+        )
         self.features = ["input_ids", "attention_mask", "labels"]
         self.semantic_loss_weight = processing_params.pop(
             "semantic_loss_weight", {}
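
Why these orderings matter, in brief: in PATCH 1/2, module scope runs top
to bottom, so sys.path.append(os.path.join(...)) raises NameError unless
os is imported first. In PATCH 2/2, self.pad_id was read inside the
eos_token fallback before __init__ had assigned it. A minimal,
self-contained sketch of that second failure mode (ToyTokenizer and
BuggyInit are hypothetical stand-ins, not modelzoo classes):

    class ToyTokenizer:
        def convert_ids_to_tokens(self, idx):
            return f"<tok_{idx}>"

    class BuggyInit:
        def __init__(self, tokenizer, eos_id, pad_id):
            self.eos_id = eos_id
            # Bug: self.pad_id is read here but assigned only afterwards
            self.eos_token = (
                tokenizer.convert_ids_to_tokens(self.pad_id)
                if self.eos_id is None
                else tokenizer.convert_ids_to_tokens(self.eos_id)
            )
            self.pad_id = pad_id

    try:
        BuggyInit(ToyTokenizer(), eos_id=None, pad_id=0)
    except AttributeError as exc:
        print(exc)  # 'BuggyInit' object has no attribute 'pad_id'

Note the failure only triggers when eos_id is None: Python evaluates just
the selected branch of a conditional expression, so a configured eos_id
masks the broken ordering. That fallback path is exactly what the patch's
reordering fixes.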