diff --git a/src/cerebras/modelzoo/data_preparation/data_preprocessing/finetuning_token_generator.py b/src/cerebras/modelzoo/data_preparation/data_preprocessing/finetuning_token_generator.py index 4d66c287..7495397d 100644 --- a/src/cerebras/modelzoo/data_preparation/data_preprocessing/finetuning_token_generator.py +++ b/src/cerebras/modelzoo/data_preparation/data_preprocessing/finetuning_token_generator.py @@ -70,13 +70,15 @@ def __init__(self, params, tokenizer, eos_id, pad_id): ) self.tokenizer.chat_template = default_chat_template() - self.eos_id = eos_id - self.eos_token = ( - self.tokenizer.convert_ids_to_tokens(self.pad_id) - if self.eos_id is None - else self.tokenizer.convert_ids_to_tokens(self.eos_id) - ) - self.pad_id = pad_id + # Assign pad_id first so it's available for use below + self.pad_id = pad_id + self.eos_id = eos_id + # Set EOS token, using pad_id as fallback if eos_id is None + self.eos_token = ( + self.tokenizer.convert_ids_to_tokens(self.pad_id) + if self.eos_id is None + else self.tokenizer.convert_ids_to_tokens(self.eos_id) + ) self.features = ["input_ids", "attention_mask", "labels"] self.semantic_loss_weight = processing_params.pop( "semantic_loss_weight", {} diff --git a/src/cerebras/modelzoo/data_preparation/vision/dit/vae_hf_cs.py b/src/cerebras/modelzoo/data_preparation/vision/dit/vae_hf_cs.py index ddf366ee..cb249ca5 100644 --- a/src/cerebras/modelzoo/data_preparation/vision/dit/vae_hf_cs.py +++ b/src/cerebras/modelzoo/data_preparation/vision/dit/vae_hf_cs.py @@ -14,6 +14,8 @@ # isort: off import sys +# ensure os is available before it's used below +import os sys.path.append(os.path.join(os.path.dirname(__file__), "../../../")) # isort: on