From d0e576fd438644a0337091dd6587f918e3c8e531 Mon Sep 17 00:00:00 2001
From: Nipers
Date: Mon, 20 Jan 2025 18:50:13 +0800
Subject: [PATCH 1/3] apply liger kernel for qwenmodels in
 ActorRolloutRefWorker and PRIMERewardModelWorker

---
 training/verl/workers/fsdp_workers.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/training/verl/workers/fsdp_workers.py b/training/verl/workers/fsdp_workers.py
index 5aa1328..315de4c 100644
--- a/training/verl/workers/fsdp_workers.py
+++ b/training/verl/workers/fsdp_workers.py
@@ -134,6 +134,16 @@ def _build_model_optimizer(self,
         # NOTE(fix me): tie_word_embedding causes meta_tensor init to hang
         init_context = get_init_weight_context_manager(use_meta_tensor=not actor_model_config.tie_word_embeddings)
 
+        # Apply Liger kernel optimizations to Qwen2 model
+        from liger_kernel.transformers import apply_liger_kernel_to_qwen2
+        apply_liger_kernel_to_qwen2(
+            rope=False,
+            cross_entropy=False,
+            fused_linear_cross_entropy=True,
+            rms_norm=True,
+            swiglu=True
+        )
+
         with init_context(), warnings.catch_warnings():
             warnings.simplefilter("ignore")
             actor_module = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=local_path,
@@ -690,6 +700,16 @@ def _build_model(self, config):
         # note that we have to create model in fp32. Otherwise, the optimizer is in bf16, which is incorrect
         init_context = get_init_weight_context_manager(use_meta_tensor=not model_config.tie_word_embeddings)
 
+        # Apply Liger kernel optimizations to Qwen2 model
+        from liger_kernel.transformers import apply_liger_kernel_to_qwen2
+        apply_liger_kernel_to_qwen2(
+            rope=False,
+            cross_entropy=False,
+            fused_linear_cross_entropy=True,
+            rms_norm=True,
+            swiglu=True
+        )
+
         with init_context(), warnings.catch_warnings():
             warnings.simplefilter("ignore")
             reward_module = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=local_path,

From 497d010a859104eef1f64f2bba1759b374e68f79 Mon Sep 17 00:00:00 2001
From: Ganqu CUI <790799494@qq.com>
Date: Sat, 25 Jan 2025 09:14:34 +0800
Subject: [PATCH 2/3] fix precision

---
 training/verl/workers/fsdp_workers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/training/verl/workers/fsdp_workers.py b/training/verl/workers/fsdp_workers.py
index 315de4c..e565687 100644
--- a/training/verl/workers/fsdp_workers.py
+++ b/training/verl/workers/fsdp_workers.py
@@ -892,10 +892,10 @@ def _build_model_optimizer(self, config, enable_gradient_checkpointing=False):
         with init_context(), warnings.catch_warnings():
             warnings.simplefilter("ignore")
             reward_module = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=local_path,
-                                                                 torch_dtype=torch.bfloat16,
+                                                                 torch_dtype=torch.float32,
                                                                  attn_implementation='flash_attention_2',
                                                                  trust_remote_code=trust_remote_code)
-            reward_module.to(torch.bfloat16)
+            reward_module.to(torch.float32)
             if enable_gradient_checkpointing:
                 reward_module.gradient_checkpointing_enable()
         from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, MixedPrecision

From 56d99808e38784e7da08d790b67088a3f7f797b9 Mon Sep 17 00:00:00 2001
From: Nipers
Date: Mon, 27 Jan 2025 21:34:47 +0800
Subject: [PATCH 3/3] add liger init operations

---
 training/verl/workers/fsdp_workers.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/training/verl/workers/fsdp_workers.py b/training/verl/workers/fsdp_workers.py
index e565687..adf2653 100644
--- a/training/verl/workers/fsdp_workers.py
+++ b/training/verl/workers/fsdp_workers.py
@@ -700,16 +700,6 @@ def _build_model(self, config):
         # note that we have to create model in fp32. Otherwise, the optimizer is in bf16, which is incorrect
         init_context = get_init_weight_context_manager(use_meta_tensor=not model_config.tie_word_embeddings)
 
-        # Apply Liger kernel optimizations to Qwen2 model
-        from liger_kernel.transformers import apply_liger_kernel_to_qwen2
-        apply_liger_kernel_to_qwen2(
-            rope=False,
-            cross_entropy=False,
-            fused_linear_cross_entropy=True,
-            rms_norm=True,
-            swiglu=True
-        )
-
         with init_context(), warnings.catch_warnings():
             warnings.simplefilter("ignore")
             reward_module = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=local_path,
@@ -889,6 +879,16 @@ def _build_model_optimizer(self, config, enable_gradient_checkpointing=False):
             check_model_support_rmpad(model_config.model_type)
         init_context = get_init_weight_context_manager(use_meta_tensor=not model_config.tie_word_embeddings)
 
+        # Apply Liger kernel optimizations to Qwen2 model
+        from liger_kernel.transformers import apply_liger_kernel_to_qwen2
+        apply_liger_kernel_to_qwen2(
+            rope=False,
+            cross_entropy=False,
+            fused_linear_cross_entropy=True,
+            rms_norm=True,
+            swiglu=True
+        )
+
         with init_context(), warnings.catch_warnings():
             warnings.simplefilter("ignore")
             reward_module = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=local_path,
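
The common thread of the three patches is ordering: apply_liger_kernel_to_qwen2 is called before AutoModelForCausalLM.from_pretrained, so the monkey-patched Qwen2 modules are already in place when the weights are loaded. The following is a minimal standalone sketch of that ordering outside the verl workers; the model id and dtype are illustrative assumptions, not values taken from the patches.

    # Standalone sketch of the liger-before-load pattern (illustrative only).
    import torch
    from liger_kernel.transformers import apply_liger_kernel_to_qwen2
    from transformers import AutoModelForCausalLM

    # Patch the HF Qwen2 modules in place: Liger RMSNorm and SwiGLU plus a fused
    # linear + cross-entropy head; RoPE and plain cross entropy stay on the stock
    # HF implementations (same flag combination as in the patches above).
    apply_liger_kernel_to_qwen2(
        rope=False,
        cross_entropy=False,
        fused_linear_cross_entropy=True,
        rms_norm=True,
        swiglu=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2-7B-Instruct",                 # hypothetical model id
        torch_dtype=torch.bfloat16,               # illustrative dtype
        attn_implementation="flash_attention_2",  # requires flash-attn; drop if unavailable
    )

With fused_linear_cross_entropy=True, the lm_head projection and the loss are computed in one fused kernel, so the full logits tensor is never materialized; that is where most of Liger's memory saving comes from during training.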