From 0d301a2f6682dd7a2884e12b5824f3e89598e1b4 Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Tue, 23 Sep 2025 11:50:04 +0800 Subject: [PATCH 01/17] update main --- ...475_train_mcore_vllm_qwen2_5_vl_7b_grpo.sh | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 scripts/mcore_vllm/.efc_6481242819968726066_18381160408474772601_1758599312638475_train_mcore_vllm_qwen2_5_vl_7b_grpo.sh diff --git a/scripts/mcore_vllm/.efc_6481242819968726066_18381160408474772601_1758599312638475_train_mcore_vllm_qwen2_5_vl_7b_grpo.sh b/scripts/mcore_vllm/.efc_6481242819968726066_18381160408474772601_1758599312638475_train_mcore_vllm_qwen2_5_vl_7b_grpo.sh new file mode 100644 index 00000000..8134f949 --- /dev/null +++ b/scripts/mcore_vllm/.efc_6481242819968726066_18381160408474772601_1758599312638475_train_mcore_vllm_qwen2_5_vl_7b_grpo.sh @@ -0,0 +1,69 @@ +#!/bin/bash +set -x + +# Tested on 8xH20-3e with 140G VRAM +export RAY_CGRAPH_get_timeout=200 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export RAY_DEDUP_LOGS=1 +export VLLM_USE_RAY_SPMD_WORKER=1 +export VLLM_USE_RAY_COMPILED_DAG=1 + +export CHATLEARN=$(pwd) +export MEGATRON_PATH=${CHATLEARN}/../Pai-Megatron-Patch/backends/megatron/Megatron-LM-250624 +export MEGATRON_PATCH_PATH=${CHATLEARN}/../Pai-Megatron-Patch +export PYTHONPATH=${CHATLEARN}:${MEGATRON_PATCH_PATH}:${MEGATRON_PATH}:${PYTHONPATH} + +source scripts/base_env.sh + +hf_ckpt_path=${CHATLEARN}/pretrained_models/Qwen2.5-VL-7B-Instruct +mcore_ckpt_path=${CHATLEARN}/pretrained_models/Qwen2.5-VL-7B-Instruct-to-mcore + +exp_name="test_qwen25_vl_7b" +export output_dir=${CHATLEARN}/output/${exp_name} +mkdir -p $output_dir/ +export log_dir=${output_dir}/logs +mkdir -p $log_dir +log_file=$log_dir/${exp_name}_rank${RANK}.log + +python chatlearn/entrypoint.py grpo --config-file template/grpo_megatron.yaml \ + runtime_args.exp_name=${exp_name} \ + runtime_args.log_args_dict.enable_tensorboard=True \ + runtime_args.train_backend=megatron \ + runtime_args.data_path=${CHATLEARN}/dataset/geo3k/train.parquet \ + runtime_args.eval_data_path=${CHATLEARN}/dataset/geo3k/test.parquet \ + runtime_args.output_dir=${CHATLEARN}/output/${exp_name} \ + runtime_args.num_episode=50 \ + runtime_args.sample_per_episode=2048 \ + runtime_args.train_global_batch_size=2048 \ + runtime_args.train_micro_batch_size=1 \ + runtime_args.save_episode_interval=1000000 \ + runtime_args.log_args_dict.enable_tensorboard=True \ + runtime_args.log_args_dict.tensorboard_dir=${output_dir}/tensorboard \ + runtime_args.eval_episode_interval=1 \ + runtime_args.enable_eval_before_training=True \ + runtime_args.model_type=vlm \ + models.policy_trainer.num_gpu=${num_device} \ + models.policy_trainer.packing=False \ + models.policy_trainer.max_token_in_packing=8192 \ + models.policy_trainer.bf16=True \ + models.policy_trainer.sequence_parallel=True \ + models.policy_trainer.use_distributed_optimizer=True \ + models.policy_trainer.recompute_granularity=null \ + models.policy_trainer.tensor_model_parallel_size=2 \ + models.policy_trainer.pipeline_model_parallel_size=1 \ + models.policy_trainer.generation_batch_size=512 \ + models.policy_trainer.load=${mcore_ckpt_path} \ + models.policy_trainer.optimizer.lr=2e-6 \ + models.policy_trainer.optimizer.min_lr=2e-6 \ + models.policy_trainer.pos_clip_ratio=0.2 \ + models.policy_trainer.neg_clip_ratio=0.2 \ + models.reward.generation_batch_size=128 \ + models.policy.load=${hf_ckpt_path} \ + models.policy.generation_batch_size=256 \ + models.policy.tensor_model_parallel_size=1 \ + 
models.policy.max_prompt_tokens_length=1024 \ + models.policy.max_response_tokens_length=2048 \ + models.policy.num_inference_per_prompt=32 \ + models.policy.gpu_memory_utilization=0.75 \ + models.policy.enable_thinking=False \ + 2>&1 | tee ${log_file} ; exit ${PIPESTATUS[0]} \ No newline at end of file From 0f822a2b2e36e8080a9c905855729734d3a18fce Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Tue, 23 Sep 2025 11:56:14 +0800 Subject: [PATCH 02/17] remove --- ...475_train_mcore_vllm_qwen2_5_vl_7b_grpo.sh | 69 ------------------- 1 file changed, 69 deletions(-) delete mode 100644 scripts/mcore_vllm/.efc_6481242819968726066_18381160408474772601_1758599312638475_train_mcore_vllm_qwen2_5_vl_7b_grpo.sh diff --git a/scripts/mcore_vllm/.efc_6481242819968726066_18381160408474772601_1758599312638475_train_mcore_vllm_qwen2_5_vl_7b_grpo.sh b/scripts/mcore_vllm/.efc_6481242819968726066_18381160408474772601_1758599312638475_train_mcore_vllm_qwen2_5_vl_7b_grpo.sh deleted file mode 100644 index 8134f949..00000000 --- a/scripts/mcore_vllm/.efc_6481242819968726066_18381160408474772601_1758599312638475_train_mcore_vllm_qwen2_5_vl_7b_grpo.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -set -x - -# Tested on 8xH20-3e with 140G VRAM -export RAY_CGRAPH_get_timeout=200 -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export RAY_DEDUP_LOGS=1 -export VLLM_USE_RAY_SPMD_WORKER=1 -export VLLM_USE_RAY_COMPILED_DAG=1 - -export CHATLEARN=$(pwd) -export MEGATRON_PATH=${CHATLEARN}/../Pai-Megatron-Patch/backends/megatron/Megatron-LM-250624 -export MEGATRON_PATCH_PATH=${CHATLEARN}/../Pai-Megatron-Patch -export PYTHONPATH=${CHATLEARN}:${MEGATRON_PATCH_PATH}:${MEGATRON_PATH}:${PYTHONPATH} - -source scripts/base_env.sh - -hf_ckpt_path=${CHATLEARN}/pretrained_models/Qwen2.5-VL-7B-Instruct -mcore_ckpt_path=${CHATLEARN}/pretrained_models/Qwen2.5-VL-7B-Instruct-to-mcore - -exp_name="test_qwen25_vl_7b" -export output_dir=${CHATLEARN}/output/${exp_name} -mkdir -p $output_dir/ -export log_dir=${output_dir}/logs -mkdir -p $log_dir -log_file=$log_dir/${exp_name}_rank${RANK}.log - -python chatlearn/entrypoint.py grpo --config-file template/grpo_megatron.yaml \ - runtime_args.exp_name=${exp_name} \ - runtime_args.log_args_dict.enable_tensorboard=True \ - runtime_args.train_backend=megatron \ - runtime_args.data_path=${CHATLEARN}/dataset/geo3k/train.parquet \ - runtime_args.eval_data_path=${CHATLEARN}/dataset/geo3k/test.parquet \ - runtime_args.output_dir=${CHATLEARN}/output/${exp_name} \ - runtime_args.num_episode=50 \ - runtime_args.sample_per_episode=2048 \ - runtime_args.train_global_batch_size=2048 \ - runtime_args.train_micro_batch_size=1 \ - runtime_args.save_episode_interval=1000000 \ - runtime_args.log_args_dict.enable_tensorboard=True \ - runtime_args.log_args_dict.tensorboard_dir=${output_dir}/tensorboard \ - runtime_args.eval_episode_interval=1 \ - runtime_args.enable_eval_before_training=True \ - runtime_args.model_type=vlm \ - models.policy_trainer.num_gpu=${num_device} \ - models.policy_trainer.packing=False \ - models.policy_trainer.max_token_in_packing=8192 \ - models.policy_trainer.bf16=True \ - models.policy_trainer.sequence_parallel=True \ - models.policy_trainer.use_distributed_optimizer=True \ - models.policy_trainer.recompute_granularity=null \ - models.policy_trainer.tensor_model_parallel_size=2 \ - models.policy_trainer.pipeline_model_parallel_size=1 \ - models.policy_trainer.generation_batch_size=512 \ - models.policy_trainer.load=${mcore_ckpt_path} \ - models.policy_trainer.optimizer.lr=2e-6 \ - 
models.policy_trainer.optimizer.min_lr=2e-6 \ - models.policy_trainer.pos_clip_ratio=0.2 \ - models.policy_trainer.neg_clip_ratio=0.2 \ - models.reward.generation_batch_size=128 \ - models.policy.load=${hf_ckpt_path} \ - models.policy.generation_batch_size=256 \ - models.policy.tensor_model_parallel_size=1 \ - models.policy.max_prompt_tokens_length=1024 \ - models.policy.max_response_tokens_length=2048 \ - models.policy.num_inference_per_prompt=32 \ - models.policy.gpu_memory_utilization=0.75 \ - models.policy.enable_thinking=False \ - 2>&1 | tee ${log_file} ; exit ${PIPESTATUS[0]} \ No newline at end of file From d47817ea583733ed59cbe29eba1c84a036790f0b Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Tue, 14 Oct 2025 10:43:48 +0800 Subject: [PATCH 03/17] update --- .../algorithm/grpo_utils/loss_gallery.py | 6 +- .../algorithm/grpo_utils/policy_trainer.py | 2 + chatlearn/models/fsdp_module.py | 8 +- chatlearn/models/sglang_module.py | 8 +- model_keys.txt | 0 qwen3vl_model_keys.txt | 882 ++++++++++++++++++ 6 files changed, 897 insertions(+), 9 deletions(-) create mode 100644 model_keys.txt create mode 100644 qwen3vl_model_keys.txt diff --git a/chatlearn/algorithm/grpo_utils/loss_gallery.py b/chatlearn/algorithm/grpo_utils/loss_gallery.py index 0463cb6c..5b7f3165 100644 --- a/chatlearn/algorithm/grpo_utils/loss_gallery.py +++ b/chatlearn/algorithm/grpo_utils/loss_gallery.py @@ -16,7 +16,8 @@ def calculate_grpo_loss( # clip logprobs_diff before exp to avoid overflow logprobs_diff = torch.clamp(logprobs_diff, max=diff_clip_ratio) ratio = torch.exp(logprobs_diff) - advantages = torch.tensor(advantages).to(logprobs_diff.device) + # advantages = torch.tensor(advantages).to(logprobs_diff.device) + advantages = torch.tensor(advantages, dtype=logprobs_diff.dtype, device=logprobs_diff.device) pg_loss = -advantages.unsqueeze(-1) * ratio # Upper and lower bound clip pg_loss_2 = -advantages.unsqueeze(-1) * torch.clamp( @@ -50,7 +51,8 @@ def calculate_gspo_loss( logprobs_diff = torch.clamp(seq_logprobs_diff, max=diff_clip_ratio) ratio = torch.exp(logprobs_diff) - advantages = torch.tensor(advantages).to(logprobs_diff.device) + # advantages = torch.tensor(advantages).to(logprobs_diff.device) + advantages = torch.tensor(advantages, dtype=logprobs_diff.dtype, device=logprobs_diff.device) advantages.unsqueeze_(-1) pg_loss = -advantages * ratio diff --git a/chatlearn/algorithm/grpo_utils/policy_trainer.py b/chatlearn/algorithm/grpo_utils/policy_trainer.py index 8a69b45f..36fb87e4 100644 --- a/chatlearn/algorithm/grpo_utils/policy_trainer.py +++ b/chatlearn/algorithm/grpo_utils/policy_trainer.py @@ -270,6 +270,8 @@ def train_step(self, data_list: List[Dict[str, Any]], **kwargs): # pylint: disab total_loss = total_loss - self.module_args.entropy_coef * entropy_loss_mean if self.module_args.kl_coef > 0: total_loss = total_loss + self.module_args.kl_coef * kl_loss_mean + # breakpoint() + # total_loss = total_loss.bfloat16() total_loss.backward() pg_loss_list.append(pg_loss.detach()) diff --git a/chatlearn/models/fsdp_module.py b/chatlearn/models/fsdp_module.py index 9709e2b7..ef301f92 100644 --- a/chatlearn/models/fsdp_module.py +++ b/chatlearn/models/fsdp_module.py @@ -344,7 +344,7 @@ def model_setup(self): model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False}) # fsdp2 warp - mix_precision_config = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32, cast_forward_inputs=True) + mix_precision_config = MixedPrecisionPolicy(param_dtype=torch.float32, 
reduce_dtype=torch.float32, cast_forward_inputs=True) fsdp_kwargs = { "mesh": self.device_mesh, "mp_policy": mix_precision_config, @@ -371,6 +371,7 @@ def model_setup(self): self.model = model self.model.to(torch.float32) + # breakpoint() if not self.trainable: self.optimizer = None @@ -507,9 +508,10 @@ def get_weight_ipc_handles_by_name(self, block_name: List[str]): if rollout_engine == "sglang": # lazy import sglang from sglang.srt.utils import MultiprocessingSerializer - from sglang.srt.patch_torch import monkey_patch_torch_reductions + # from sglang.srt.patch_torch import monkey_patch_torch_reductions - monkey_patch_torch_reductions() + # monkey_patch_torch_reductions() + flattened_tensor, metadatas = self.convert_block2flattened_bucket( block_parameter ) diff --git a/chatlearn/models/sglang_module.py b/chatlearn/models/sglang_module.py index dfb91970..04357591 100644 --- a/chatlearn/models/sglang_module.py +++ b/chatlearn/models/sglang_module.py @@ -570,8 +570,8 @@ def parameter_sync(self): def update_weights_from_buckets(self, buckets: List[Optional['BucketInfo']]): """Used for Mcore2SGLang Parameter Sync """ - from sglang.srt.patch_torch import monkey_patch_torch_reductions - monkey_patch_torch_reductions() + # from sglang.srt.patch_torch import monkey_patch_torch_reductions + # monkey_patch_torch_reductions() param_id_to_update = set() for bucket in buckets: if bucket is None: @@ -766,8 +766,8 @@ async def update_weights_from_ipc_handles(self, reduce_data): @torch.no_grad() async def update_weights_from_buckets(self, buckets: List[Optional['BucketInfo']]): - from sglang.srt.patch_torch import monkey_patch_torch_reductions - monkey_patch_torch_reductions() + # from sglang.srt.patch_torch import monkey_patch_torch_reductions + # monkey_patch_torch_reductions() param_id_to_update = set() for bucket in buckets: if bucket is None: diff --git a/model_keys.txt b/model_keys.txt new file mode 100644 index 00000000..e69de29b diff --git a/qwen3vl_model_keys.txt b/qwen3vl_model_keys.txt new file mode 100644 index 00000000..f100c5b8 --- /dev/null +++ b/qwen3vl_model_keys.txt @@ -0,0 +1,882 @@ +model.visual.patch_embed.proj.weight +model.visual.patch_embed.proj.bias +model.visual.pos_embed.weight +model.visual.blocks.0.norm1.weight +model.visual.blocks.0.norm1.bias +model.visual.blocks.0.norm2.weight +model.visual.blocks.0.norm2.bias +model.visual.blocks.0.attn.qkv.weight +model.visual.blocks.0.attn.qkv.bias +model.visual.blocks.0.attn.proj.weight +model.visual.blocks.0.attn.proj.bias +model.visual.blocks.0.mlp.linear_fc1.weight +model.visual.blocks.0.mlp.linear_fc1.bias +model.visual.blocks.0.mlp.linear_fc2.weight +model.visual.blocks.0.mlp.linear_fc2.bias +model.visual.blocks.1.norm1.weight +model.visual.blocks.1.norm1.bias +model.visual.blocks.1.norm2.weight +model.visual.blocks.1.norm2.bias +model.visual.blocks.1.attn.qkv.weight +model.visual.blocks.1.attn.qkv.bias +model.visual.blocks.1.attn.proj.weight +model.visual.blocks.1.attn.proj.bias +model.visual.blocks.1.mlp.linear_fc1.weight +model.visual.blocks.1.mlp.linear_fc1.bias +model.visual.blocks.1.mlp.linear_fc2.weight +model.visual.blocks.1.mlp.linear_fc2.bias +model.visual.blocks.2.norm1.weight +model.visual.blocks.2.norm1.bias +model.visual.blocks.2.norm2.weight +model.visual.blocks.2.norm2.bias +model.visual.blocks.2.attn.qkv.weight +model.visual.blocks.2.attn.qkv.bias +model.visual.blocks.2.attn.proj.weight +model.visual.blocks.2.attn.proj.bias +model.visual.blocks.2.mlp.linear_fc1.weight +model.visual.blocks.2.mlp.linear_fc1.bias 
+model.visual.blocks.2.mlp.linear_fc2.weight +model.visual.blocks.2.mlp.linear_fc2.bias +model.visual.blocks.3.norm1.weight +model.visual.blocks.3.norm1.bias +model.visual.blocks.3.norm2.weight +model.visual.blocks.3.norm2.bias +model.visual.blocks.3.attn.qkv.weight +model.visual.blocks.3.attn.qkv.bias +model.visual.blocks.3.attn.proj.weight +model.visual.blocks.3.attn.proj.bias +model.visual.blocks.3.mlp.linear_fc1.weight +model.visual.blocks.3.mlp.linear_fc1.bias +model.visual.blocks.3.mlp.linear_fc2.weight +model.visual.blocks.3.mlp.linear_fc2.bias +model.visual.blocks.4.norm1.weight +model.visual.blocks.4.norm1.bias +model.visual.blocks.4.norm2.weight +model.visual.blocks.4.norm2.bias +model.visual.blocks.4.attn.qkv.weight +model.visual.blocks.4.attn.qkv.bias +model.visual.blocks.4.attn.proj.weight +model.visual.blocks.4.attn.proj.bias +model.visual.blocks.4.mlp.linear_fc1.weight +model.visual.blocks.4.mlp.linear_fc1.bias +model.visual.blocks.4.mlp.linear_fc2.weight +model.visual.blocks.4.mlp.linear_fc2.bias +model.visual.blocks.5.norm1.weight +model.visual.blocks.5.norm1.bias +model.visual.blocks.5.norm2.weight +model.visual.blocks.5.norm2.bias +model.visual.blocks.5.attn.qkv.weight +model.visual.blocks.5.attn.qkv.bias +model.visual.blocks.5.attn.proj.weight +model.visual.blocks.5.attn.proj.bias +model.visual.blocks.5.mlp.linear_fc1.weight +model.visual.blocks.5.mlp.linear_fc1.bias +model.visual.blocks.5.mlp.linear_fc2.weight +model.visual.blocks.5.mlp.linear_fc2.bias +model.visual.blocks.6.norm1.weight +model.visual.blocks.6.norm1.bias +model.visual.blocks.6.norm2.weight +model.visual.blocks.6.norm2.bias +model.visual.blocks.6.attn.qkv.weight +model.visual.blocks.6.attn.qkv.bias +model.visual.blocks.6.attn.proj.weight +model.visual.blocks.6.attn.proj.bias +model.visual.blocks.6.mlp.linear_fc1.weight +model.visual.blocks.6.mlp.linear_fc1.bias +model.visual.blocks.6.mlp.linear_fc2.weight +model.visual.blocks.6.mlp.linear_fc2.bias +model.visual.blocks.7.norm1.weight +model.visual.blocks.7.norm1.bias +model.visual.blocks.7.norm2.weight +model.visual.blocks.7.norm2.bias +model.visual.blocks.7.attn.qkv.weight +model.visual.blocks.7.attn.qkv.bias +model.visual.blocks.7.attn.proj.weight +model.visual.blocks.7.attn.proj.bias +model.visual.blocks.7.mlp.linear_fc1.weight +model.visual.blocks.7.mlp.linear_fc1.bias +model.visual.blocks.7.mlp.linear_fc2.weight +model.visual.blocks.7.mlp.linear_fc2.bias +model.visual.blocks.8.norm1.weight +model.visual.blocks.8.norm1.bias +model.visual.blocks.8.norm2.weight +model.visual.blocks.8.norm2.bias +model.visual.blocks.8.attn.qkv.weight +model.visual.blocks.8.attn.qkv.bias +model.visual.blocks.8.attn.proj.weight +model.visual.blocks.8.attn.proj.bias +model.visual.blocks.8.mlp.linear_fc1.weight +model.visual.blocks.8.mlp.linear_fc1.bias +model.visual.blocks.8.mlp.linear_fc2.weight +model.visual.blocks.8.mlp.linear_fc2.bias +model.visual.blocks.9.norm1.weight +model.visual.blocks.9.norm1.bias +model.visual.blocks.9.norm2.weight +model.visual.blocks.9.norm2.bias +model.visual.blocks.9.attn.qkv.weight +model.visual.blocks.9.attn.qkv.bias +model.visual.blocks.9.attn.proj.weight +model.visual.blocks.9.attn.proj.bias +model.visual.blocks.9.mlp.linear_fc1.weight +model.visual.blocks.9.mlp.linear_fc1.bias +model.visual.blocks.9.mlp.linear_fc2.weight +model.visual.blocks.9.mlp.linear_fc2.bias +model.visual.blocks.10.norm1.weight +model.visual.blocks.10.norm1.bias +model.visual.blocks.10.norm2.weight +model.visual.blocks.10.norm2.bias 
+model.visual.blocks.10.attn.qkv.weight +model.visual.blocks.10.attn.qkv.bias +model.visual.blocks.10.attn.proj.weight +model.visual.blocks.10.attn.proj.bias +model.visual.blocks.10.mlp.linear_fc1.weight +model.visual.blocks.10.mlp.linear_fc1.bias +model.visual.blocks.10.mlp.linear_fc2.weight +model.visual.blocks.10.mlp.linear_fc2.bias +model.visual.blocks.11.norm1.weight +model.visual.blocks.11.norm1.bias +model.visual.blocks.11.norm2.weight +model.visual.blocks.11.norm2.bias +model.visual.blocks.11.attn.qkv.weight +model.visual.blocks.11.attn.qkv.bias +model.visual.blocks.11.attn.proj.weight +model.visual.blocks.11.attn.proj.bias +model.visual.blocks.11.mlp.linear_fc1.weight +model.visual.blocks.11.mlp.linear_fc1.bias +model.visual.blocks.11.mlp.linear_fc2.weight +model.visual.blocks.11.mlp.linear_fc2.bias +model.visual.blocks.12.norm1.weight +model.visual.blocks.12.norm1.bias +model.visual.blocks.12.norm2.weight +model.visual.blocks.12.norm2.bias +model.visual.blocks.12.attn.qkv.weight +model.visual.blocks.12.attn.qkv.bias +model.visual.blocks.12.attn.proj.weight +model.visual.blocks.12.attn.proj.bias +model.visual.blocks.12.mlp.linear_fc1.weight +model.visual.blocks.12.mlp.linear_fc1.bias +model.visual.blocks.12.mlp.linear_fc2.weight +model.visual.blocks.12.mlp.linear_fc2.bias +model.visual.blocks.13.norm1.weight +model.visual.blocks.13.norm1.bias +model.visual.blocks.13.norm2.weight +model.visual.blocks.13.norm2.bias +model.visual.blocks.13.attn.qkv.weight +model.visual.blocks.13.attn.qkv.bias +model.visual.blocks.13.attn.proj.weight +model.visual.blocks.13.attn.proj.bias +model.visual.blocks.13.mlp.linear_fc1.weight +model.visual.blocks.13.mlp.linear_fc1.bias +model.visual.blocks.13.mlp.linear_fc2.weight +model.visual.blocks.13.mlp.linear_fc2.bias +model.visual.blocks.14.norm1.weight +model.visual.blocks.14.norm1.bias +model.visual.blocks.14.norm2.weight +model.visual.blocks.14.norm2.bias +model.visual.blocks.14.attn.qkv.weight +model.visual.blocks.14.attn.qkv.bias +model.visual.blocks.14.attn.proj.weight +model.visual.blocks.14.attn.proj.bias +model.visual.blocks.14.mlp.linear_fc1.weight +model.visual.blocks.14.mlp.linear_fc1.bias +model.visual.blocks.14.mlp.linear_fc2.weight +model.visual.blocks.14.mlp.linear_fc2.bias +model.visual.blocks.15.norm1.weight +model.visual.blocks.15.norm1.bias +model.visual.blocks.15.norm2.weight +model.visual.blocks.15.norm2.bias +model.visual.blocks.15.attn.qkv.weight +model.visual.blocks.15.attn.qkv.bias +model.visual.blocks.15.attn.proj.weight +model.visual.blocks.15.attn.proj.bias +model.visual.blocks.15.mlp.linear_fc1.weight +model.visual.blocks.15.mlp.linear_fc1.bias +model.visual.blocks.15.mlp.linear_fc2.weight +model.visual.blocks.15.mlp.linear_fc2.bias +model.visual.blocks.16.norm1.weight +model.visual.blocks.16.norm1.bias +model.visual.blocks.16.norm2.weight +model.visual.blocks.16.norm2.bias +model.visual.blocks.16.attn.qkv.weight +model.visual.blocks.16.attn.qkv.bias +model.visual.blocks.16.attn.proj.weight +model.visual.blocks.16.attn.proj.bias +model.visual.blocks.16.mlp.linear_fc1.weight +model.visual.blocks.16.mlp.linear_fc1.bias +model.visual.blocks.16.mlp.linear_fc2.weight +model.visual.blocks.16.mlp.linear_fc2.bias +model.visual.blocks.17.norm1.weight +model.visual.blocks.17.norm1.bias +model.visual.blocks.17.norm2.weight +model.visual.blocks.17.norm2.bias +model.visual.blocks.17.attn.qkv.weight +model.visual.blocks.17.attn.qkv.bias +model.visual.blocks.17.attn.proj.weight +model.visual.blocks.17.attn.proj.bias 
+model.visual.blocks.17.mlp.linear_fc1.weight +model.visual.blocks.17.mlp.linear_fc1.bias +model.visual.blocks.17.mlp.linear_fc2.weight +model.visual.blocks.17.mlp.linear_fc2.bias +model.visual.blocks.18.norm1.weight +model.visual.blocks.18.norm1.bias +model.visual.blocks.18.norm2.weight +model.visual.blocks.18.norm2.bias +model.visual.blocks.18.attn.qkv.weight +model.visual.blocks.18.attn.qkv.bias +model.visual.blocks.18.attn.proj.weight +model.visual.blocks.18.attn.proj.bias +model.visual.blocks.18.mlp.linear_fc1.weight +model.visual.blocks.18.mlp.linear_fc1.bias +model.visual.blocks.18.mlp.linear_fc2.weight +model.visual.blocks.18.mlp.linear_fc2.bias +model.visual.blocks.19.norm1.weight +model.visual.blocks.19.norm1.bias +model.visual.blocks.19.norm2.weight +model.visual.blocks.19.norm2.bias +model.visual.blocks.19.attn.qkv.weight +model.visual.blocks.19.attn.qkv.bias +model.visual.blocks.19.attn.proj.weight +model.visual.blocks.19.attn.proj.bias +model.visual.blocks.19.mlp.linear_fc1.weight +model.visual.blocks.19.mlp.linear_fc1.bias +model.visual.blocks.19.mlp.linear_fc2.weight +model.visual.blocks.19.mlp.linear_fc2.bias +model.visual.blocks.20.norm1.weight +model.visual.blocks.20.norm1.bias +model.visual.blocks.20.norm2.weight +model.visual.blocks.20.norm2.bias +model.visual.blocks.20.attn.qkv.weight +model.visual.blocks.20.attn.qkv.bias +model.visual.blocks.20.attn.proj.weight +model.visual.blocks.20.attn.proj.bias +model.visual.blocks.20.mlp.linear_fc1.weight +model.visual.blocks.20.mlp.linear_fc1.bias +model.visual.blocks.20.mlp.linear_fc2.weight +model.visual.blocks.20.mlp.linear_fc2.bias +model.visual.blocks.21.norm1.weight +model.visual.blocks.21.norm1.bias +model.visual.blocks.21.norm2.weight +model.visual.blocks.21.norm2.bias +model.visual.blocks.21.attn.qkv.weight +model.visual.blocks.21.attn.qkv.bias +model.visual.blocks.21.attn.proj.weight +model.visual.blocks.21.attn.proj.bias +model.visual.blocks.21.mlp.linear_fc1.weight +model.visual.blocks.21.mlp.linear_fc1.bias +model.visual.blocks.21.mlp.linear_fc2.weight +model.visual.blocks.21.mlp.linear_fc2.bias +model.visual.blocks.22.norm1.weight +model.visual.blocks.22.norm1.bias +model.visual.blocks.22.norm2.weight +model.visual.blocks.22.norm2.bias +model.visual.blocks.22.attn.qkv.weight +model.visual.blocks.22.attn.qkv.bias +model.visual.blocks.22.attn.proj.weight +model.visual.blocks.22.attn.proj.bias +model.visual.blocks.22.mlp.linear_fc1.weight +model.visual.blocks.22.mlp.linear_fc1.bias +model.visual.blocks.22.mlp.linear_fc2.weight +model.visual.blocks.22.mlp.linear_fc2.bias +model.visual.blocks.23.norm1.weight +model.visual.blocks.23.norm1.bias +model.visual.blocks.23.norm2.weight +model.visual.blocks.23.norm2.bias +model.visual.blocks.23.attn.qkv.weight +model.visual.blocks.23.attn.qkv.bias +model.visual.blocks.23.attn.proj.weight +model.visual.blocks.23.attn.proj.bias +model.visual.blocks.23.mlp.linear_fc1.weight +model.visual.blocks.23.mlp.linear_fc1.bias +model.visual.blocks.23.mlp.linear_fc2.weight +model.visual.blocks.23.mlp.linear_fc2.bias +model.visual.blocks.24.norm1.weight +model.visual.blocks.24.norm1.bias +model.visual.blocks.24.norm2.weight +model.visual.blocks.24.norm2.bias +model.visual.blocks.24.attn.qkv.weight +model.visual.blocks.24.attn.qkv.bias +model.visual.blocks.24.attn.proj.weight +model.visual.blocks.24.attn.proj.bias +model.visual.blocks.24.mlp.linear_fc1.weight +model.visual.blocks.24.mlp.linear_fc1.bias +model.visual.blocks.24.mlp.linear_fc2.weight +model.visual.blocks.24.mlp.linear_fc2.bias 
+model.visual.blocks.25.norm1.weight +model.visual.blocks.25.norm1.bias +model.visual.blocks.25.norm2.weight +model.visual.blocks.25.norm2.bias +model.visual.blocks.25.attn.qkv.weight +model.visual.blocks.25.attn.qkv.bias +model.visual.blocks.25.attn.proj.weight +model.visual.blocks.25.attn.proj.bias +model.visual.blocks.25.mlp.linear_fc1.weight +model.visual.blocks.25.mlp.linear_fc1.bias +model.visual.blocks.25.mlp.linear_fc2.weight +model.visual.blocks.25.mlp.linear_fc2.bias +model.visual.blocks.26.norm1.weight +model.visual.blocks.26.norm1.bias +model.visual.blocks.26.norm2.weight +model.visual.blocks.26.norm2.bias +model.visual.blocks.26.attn.qkv.weight +model.visual.blocks.26.attn.qkv.bias +model.visual.blocks.26.attn.proj.weight +model.visual.blocks.26.attn.proj.bias +model.visual.blocks.26.mlp.linear_fc1.weight +model.visual.blocks.26.mlp.linear_fc1.bias +model.visual.blocks.26.mlp.linear_fc2.weight +model.visual.blocks.26.mlp.linear_fc2.bias +model.visual.merger.norm.weight +model.visual.merger.norm.bias +model.visual.merger.linear_fc1.weight +model.visual.merger.linear_fc1.bias +model.visual.merger.linear_fc2.weight +model.visual.merger.linear_fc2.bias +model.visual.deepstack_merger_list.0.norm.weight +model.visual.deepstack_merger_list.0.norm.bias +model.visual.deepstack_merger_list.0.linear_fc1.weight +model.visual.deepstack_merger_list.0.linear_fc1.bias +model.visual.deepstack_merger_list.0.linear_fc2.weight +model.visual.deepstack_merger_list.0.linear_fc2.bias +model.visual.deepstack_merger_list.1.norm.weight +model.visual.deepstack_merger_list.1.norm.bias +model.visual.deepstack_merger_list.1.linear_fc1.weight +model.visual.deepstack_merger_list.1.linear_fc1.bias +model.visual.deepstack_merger_list.1.linear_fc2.weight +model.visual.deepstack_merger_list.1.linear_fc2.bias +model.visual.deepstack_merger_list.2.norm.weight +model.visual.deepstack_merger_list.2.norm.bias +model.visual.deepstack_merger_list.2.linear_fc1.weight +model.visual.deepstack_merger_list.2.linear_fc1.bias +model.visual.deepstack_merger_list.2.linear_fc2.weight +model.visual.deepstack_merger_list.2.linear_fc2.bias +model.language_model.embed_tokens.weight +model.language_model.layers.0.self_attn.q_proj.weight +model.language_model.layers.0.self_attn.k_proj.weight +model.language_model.layers.0.self_attn.v_proj.weight +model.language_model.layers.0.self_attn.o_proj.weight +model.language_model.layers.0.self_attn.q_norm.weight +model.language_model.layers.0.self_attn.k_norm.weight +model.language_model.layers.0.mlp.gate.weight +model.language_model.layers.0.mlp.experts.gate_up_proj +model.language_model.layers.0.mlp.experts.down_proj +model.language_model.layers.0.input_layernorm.weight +model.language_model.layers.0.post_attention_layernorm.weight +model.language_model.layers.1.self_attn.q_proj.weight +model.language_model.layers.1.self_attn.k_proj.weight +model.language_model.layers.1.self_attn.v_proj.weight +model.language_model.layers.1.self_attn.o_proj.weight +model.language_model.layers.1.self_attn.q_norm.weight +model.language_model.layers.1.self_attn.k_norm.weight +model.language_model.layers.1.mlp.gate.weight +model.language_model.layers.1.mlp.experts.gate_up_proj +model.language_model.layers.1.mlp.experts.down_proj +model.language_model.layers.1.input_layernorm.weight +model.language_model.layers.1.post_attention_layernorm.weight +model.language_model.layers.2.self_attn.q_proj.weight +model.language_model.layers.2.self_attn.k_proj.weight +model.language_model.layers.2.self_attn.v_proj.weight 
+model.language_model.layers.2.self_attn.o_proj.weight +model.language_model.layers.2.self_attn.q_norm.weight +model.language_model.layers.2.self_attn.k_norm.weight +model.language_model.layers.2.mlp.gate.weight +model.language_model.layers.2.mlp.experts.gate_up_proj +model.language_model.layers.2.mlp.experts.down_proj +model.language_model.layers.2.input_layernorm.weight +model.language_model.layers.2.post_attention_layernorm.weight +model.language_model.layers.3.self_attn.q_proj.weight +model.language_model.layers.3.self_attn.k_proj.weight +model.language_model.layers.3.self_attn.v_proj.weight +model.language_model.layers.3.self_attn.o_proj.weight +model.language_model.layers.3.self_attn.q_norm.weight +model.language_model.layers.3.self_attn.k_norm.weight +model.language_model.layers.3.mlp.gate.weight +model.language_model.layers.3.mlp.experts.gate_up_proj +model.language_model.layers.3.mlp.experts.down_proj +model.language_model.layers.3.input_layernorm.weight +model.language_model.layers.3.post_attention_layernorm.weight +model.language_model.layers.4.self_attn.q_proj.weight +model.language_model.layers.4.self_attn.k_proj.weight +model.language_model.layers.4.self_attn.v_proj.weight +model.language_model.layers.4.self_attn.o_proj.weight +model.language_model.layers.4.self_attn.q_norm.weight +model.language_model.layers.4.self_attn.k_norm.weight +model.language_model.layers.4.mlp.gate.weight +model.language_model.layers.4.mlp.experts.gate_up_proj +model.language_model.layers.4.mlp.experts.down_proj +model.language_model.layers.4.input_layernorm.weight +model.language_model.layers.4.post_attention_layernorm.weight +model.language_model.layers.5.self_attn.q_proj.weight +model.language_model.layers.5.self_attn.k_proj.weight +model.language_model.layers.5.self_attn.v_proj.weight +model.language_model.layers.5.self_attn.o_proj.weight +model.language_model.layers.5.self_attn.q_norm.weight +model.language_model.layers.5.self_attn.k_norm.weight +model.language_model.layers.5.mlp.gate.weight +model.language_model.layers.5.mlp.experts.gate_up_proj +model.language_model.layers.5.mlp.experts.down_proj +model.language_model.layers.5.input_layernorm.weight +model.language_model.layers.5.post_attention_layernorm.weight +model.language_model.layers.6.self_attn.q_proj.weight +model.language_model.layers.6.self_attn.k_proj.weight +model.language_model.layers.6.self_attn.v_proj.weight +model.language_model.layers.6.self_attn.o_proj.weight +model.language_model.layers.6.self_attn.q_norm.weight +model.language_model.layers.6.self_attn.k_norm.weight +model.language_model.layers.6.mlp.gate.weight +model.language_model.layers.6.mlp.experts.gate_up_proj +model.language_model.layers.6.mlp.experts.down_proj +model.language_model.layers.6.input_layernorm.weight +model.language_model.layers.6.post_attention_layernorm.weight +model.language_model.layers.7.self_attn.q_proj.weight +model.language_model.layers.7.self_attn.k_proj.weight +model.language_model.layers.7.self_attn.v_proj.weight +model.language_model.layers.7.self_attn.o_proj.weight +model.language_model.layers.7.self_attn.q_norm.weight +model.language_model.layers.7.self_attn.k_norm.weight +model.language_model.layers.7.mlp.gate.weight +model.language_model.layers.7.mlp.experts.gate_up_proj +model.language_model.layers.7.mlp.experts.down_proj +model.language_model.layers.7.input_layernorm.weight +model.language_model.layers.7.post_attention_layernorm.weight +model.language_model.layers.8.self_attn.q_proj.weight 
+model.language_model.layers.8.self_attn.k_proj.weight +model.language_model.layers.8.self_attn.v_proj.weight +model.language_model.layers.8.self_attn.o_proj.weight +model.language_model.layers.8.self_attn.q_norm.weight +model.language_model.layers.8.self_attn.k_norm.weight +model.language_model.layers.8.mlp.gate.weight +model.language_model.layers.8.mlp.experts.gate_up_proj +model.language_model.layers.8.mlp.experts.down_proj +model.language_model.layers.8.input_layernorm.weight +model.language_model.layers.8.post_attention_layernorm.weight +model.language_model.layers.9.self_attn.q_proj.weight +model.language_model.layers.9.self_attn.k_proj.weight +model.language_model.layers.9.self_attn.v_proj.weight +model.language_model.layers.9.self_attn.o_proj.weight +model.language_model.layers.9.self_attn.q_norm.weight +model.language_model.layers.9.self_attn.k_norm.weight +model.language_model.layers.9.mlp.gate.weight +model.language_model.layers.9.mlp.experts.gate_up_proj +model.language_model.layers.9.mlp.experts.down_proj +model.language_model.layers.9.input_layernorm.weight +model.language_model.layers.9.post_attention_layernorm.weight +model.language_model.layers.10.self_attn.q_proj.weight +model.language_model.layers.10.self_attn.k_proj.weight +model.language_model.layers.10.self_attn.v_proj.weight +model.language_model.layers.10.self_attn.o_proj.weight +model.language_model.layers.10.self_attn.q_norm.weight +model.language_model.layers.10.self_attn.k_norm.weight +model.language_model.layers.10.mlp.gate.weight +model.language_model.layers.10.mlp.experts.gate_up_proj +model.language_model.layers.10.mlp.experts.down_proj +model.language_model.layers.10.input_layernorm.weight +model.language_model.layers.10.post_attention_layernorm.weight +model.language_model.layers.11.self_attn.q_proj.weight +model.language_model.layers.11.self_attn.k_proj.weight +model.language_model.layers.11.self_attn.v_proj.weight +model.language_model.layers.11.self_attn.o_proj.weight +model.language_model.layers.11.self_attn.q_norm.weight +model.language_model.layers.11.self_attn.k_norm.weight +model.language_model.layers.11.mlp.gate.weight +model.language_model.layers.11.mlp.experts.gate_up_proj +model.language_model.layers.11.mlp.experts.down_proj +model.language_model.layers.11.input_layernorm.weight +model.language_model.layers.11.post_attention_layernorm.weight +model.language_model.layers.12.self_attn.q_proj.weight +model.language_model.layers.12.self_attn.k_proj.weight +model.language_model.layers.12.self_attn.v_proj.weight +model.language_model.layers.12.self_attn.o_proj.weight +model.language_model.layers.12.self_attn.q_norm.weight +model.language_model.layers.12.self_attn.k_norm.weight +model.language_model.layers.12.mlp.gate.weight +model.language_model.layers.12.mlp.experts.gate_up_proj +model.language_model.layers.12.mlp.experts.down_proj +model.language_model.layers.12.input_layernorm.weight +model.language_model.layers.12.post_attention_layernorm.weight +model.language_model.layers.13.self_attn.q_proj.weight +model.language_model.layers.13.self_attn.k_proj.weight +model.language_model.layers.13.self_attn.v_proj.weight +model.language_model.layers.13.self_attn.o_proj.weight +model.language_model.layers.13.self_attn.q_norm.weight +model.language_model.layers.13.self_attn.k_norm.weight +model.language_model.layers.13.mlp.gate.weight +model.language_model.layers.13.mlp.experts.gate_up_proj +model.language_model.layers.13.mlp.experts.down_proj +model.language_model.layers.13.input_layernorm.weight 
+model.language_model.layers.13.post_attention_layernorm.weight +model.language_model.layers.14.self_attn.q_proj.weight +model.language_model.layers.14.self_attn.k_proj.weight +model.language_model.layers.14.self_attn.v_proj.weight +model.language_model.layers.14.self_attn.o_proj.weight +model.language_model.layers.14.self_attn.q_norm.weight +model.language_model.layers.14.self_attn.k_norm.weight +model.language_model.layers.14.mlp.gate.weight +model.language_model.layers.14.mlp.experts.gate_up_proj +model.language_model.layers.14.mlp.experts.down_proj +model.language_model.layers.14.input_layernorm.weight +model.language_model.layers.14.post_attention_layernorm.weight +model.language_model.layers.15.self_attn.q_proj.weight +model.language_model.layers.15.self_attn.k_proj.weight +model.language_model.layers.15.self_attn.v_proj.weight +model.language_model.layers.15.self_attn.o_proj.weight +model.language_model.layers.15.self_attn.q_norm.weight +model.language_model.layers.15.self_attn.k_norm.weight +model.language_model.layers.15.mlp.gate.weight +model.language_model.layers.15.mlp.experts.gate_up_proj +model.language_model.layers.15.mlp.experts.down_proj +model.language_model.layers.15.input_layernorm.weight +model.language_model.layers.15.post_attention_layernorm.weight +model.language_model.layers.16.self_attn.q_proj.weight +model.language_model.layers.16.self_attn.k_proj.weight +model.language_model.layers.16.self_attn.v_proj.weight +model.language_model.layers.16.self_attn.o_proj.weight +model.language_model.layers.16.self_attn.q_norm.weight +model.language_model.layers.16.self_attn.k_norm.weight +model.language_model.layers.16.mlp.gate.weight +model.language_model.layers.16.mlp.experts.gate_up_proj +model.language_model.layers.16.mlp.experts.down_proj +model.language_model.layers.16.input_layernorm.weight +model.language_model.layers.16.post_attention_layernorm.weight +model.language_model.layers.17.self_attn.q_proj.weight +model.language_model.layers.17.self_attn.k_proj.weight +model.language_model.layers.17.self_attn.v_proj.weight +model.language_model.layers.17.self_attn.o_proj.weight +model.language_model.layers.17.self_attn.q_norm.weight +model.language_model.layers.17.self_attn.k_norm.weight +model.language_model.layers.17.mlp.gate.weight +model.language_model.layers.17.mlp.experts.gate_up_proj +model.language_model.layers.17.mlp.experts.down_proj +model.language_model.layers.17.input_layernorm.weight +model.language_model.layers.17.post_attention_layernorm.weight +model.language_model.layers.18.self_attn.q_proj.weight +model.language_model.layers.18.self_attn.k_proj.weight +model.language_model.layers.18.self_attn.v_proj.weight +model.language_model.layers.18.self_attn.o_proj.weight +model.language_model.layers.18.self_attn.q_norm.weight +model.language_model.layers.18.self_attn.k_norm.weight +model.language_model.layers.18.mlp.gate.weight +model.language_model.layers.18.mlp.experts.gate_up_proj +model.language_model.layers.18.mlp.experts.down_proj +model.language_model.layers.18.input_layernorm.weight +model.language_model.layers.18.post_attention_layernorm.weight +model.language_model.layers.19.self_attn.q_proj.weight +model.language_model.layers.19.self_attn.k_proj.weight +model.language_model.layers.19.self_attn.v_proj.weight +model.language_model.layers.19.self_attn.o_proj.weight +model.language_model.layers.19.self_attn.q_norm.weight +model.language_model.layers.19.self_attn.k_norm.weight +model.language_model.layers.19.mlp.gate.weight 
+model.language_model.layers.19.mlp.experts.gate_up_proj +model.language_model.layers.19.mlp.experts.down_proj +model.language_model.layers.19.input_layernorm.weight +model.language_model.layers.19.post_attention_layernorm.weight +model.language_model.layers.20.self_attn.q_proj.weight +model.language_model.layers.20.self_attn.k_proj.weight +model.language_model.layers.20.self_attn.v_proj.weight +model.language_model.layers.20.self_attn.o_proj.weight +model.language_model.layers.20.self_attn.q_norm.weight +model.language_model.layers.20.self_attn.k_norm.weight +model.language_model.layers.20.mlp.gate.weight +model.language_model.layers.20.mlp.experts.gate_up_proj +model.language_model.layers.20.mlp.experts.down_proj +model.language_model.layers.20.input_layernorm.weight +model.language_model.layers.20.post_attention_layernorm.weight +model.language_model.layers.21.self_attn.q_proj.weight +model.language_model.layers.21.self_attn.k_proj.weight +model.language_model.layers.21.self_attn.v_proj.weight +model.language_model.layers.21.self_attn.o_proj.weight +model.language_model.layers.21.self_attn.q_norm.weight +model.language_model.layers.21.self_attn.k_norm.weight +model.language_model.layers.21.mlp.gate.weight +model.language_model.layers.21.mlp.experts.gate_up_proj +model.language_model.layers.21.mlp.experts.down_proj +model.language_model.layers.21.input_layernorm.weight +model.language_model.layers.21.post_attention_layernorm.weight +model.language_model.layers.22.self_attn.q_proj.weight +model.language_model.layers.22.self_attn.k_proj.weight +model.language_model.layers.22.self_attn.v_proj.weight +model.language_model.layers.22.self_attn.o_proj.weight +model.language_model.layers.22.self_attn.q_norm.weight +model.language_model.layers.22.self_attn.k_norm.weight +model.language_model.layers.22.mlp.gate.weight +model.language_model.layers.22.mlp.experts.gate_up_proj +model.language_model.layers.22.mlp.experts.down_proj +model.language_model.layers.22.input_layernorm.weight +model.language_model.layers.22.post_attention_layernorm.weight +model.language_model.layers.23.self_attn.q_proj.weight +model.language_model.layers.23.self_attn.k_proj.weight +model.language_model.layers.23.self_attn.v_proj.weight +model.language_model.layers.23.self_attn.o_proj.weight +model.language_model.layers.23.self_attn.q_norm.weight +model.language_model.layers.23.self_attn.k_norm.weight +model.language_model.layers.23.mlp.gate.weight +model.language_model.layers.23.mlp.experts.gate_up_proj +model.language_model.layers.23.mlp.experts.down_proj +model.language_model.layers.23.input_layernorm.weight +model.language_model.layers.23.post_attention_layernorm.weight +model.language_model.layers.24.self_attn.q_proj.weight +model.language_model.layers.24.self_attn.k_proj.weight +model.language_model.layers.24.self_attn.v_proj.weight +model.language_model.layers.24.self_attn.o_proj.weight +model.language_model.layers.24.self_attn.q_norm.weight +model.language_model.layers.24.self_attn.k_norm.weight +model.language_model.layers.24.mlp.gate.weight +model.language_model.layers.24.mlp.experts.gate_up_proj +model.language_model.layers.24.mlp.experts.down_proj +model.language_model.layers.24.input_layernorm.weight +model.language_model.layers.24.post_attention_layernorm.weight +model.language_model.layers.25.self_attn.q_proj.weight +model.language_model.layers.25.self_attn.k_proj.weight +model.language_model.layers.25.self_attn.v_proj.weight +model.language_model.layers.25.self_attn.o_proj.weight 
+model.language_model.layers.25.self_attn.q_norm.weight +model.language_model.layers.25.self_attn.k_norm.weight +model.language_model.layers.25.mlp.gate.weight +model.language_model.layers.25.mlp.experts.gate_up_proj +model.language_model.layers.25.mlp.experts.down_proj +model.language_model.layers.25.input_layernorm.weight +model.language_model.layers.25.post_attention_layernorm.weight +model.language_model.layers.26.self_attn.q_proj.weight +model.language_model.layers.26.self_attn.k_proj.weight +model.language_model.layers.26.self_attn.v_proj.weight +model.language_model.layers.26.self_attn.o_proj.weight +model.language_model.layers.26.self_attn.q_norm.weight +model.language_model.layers.26.self_attn.k_norm.weight +model.language_model.layers.26.mlp.gate.weight +model.language_model.layers.26.mlp.experts.gate_up_proj +model.language_model.layers.26.mlp.experts.down_proj +model.language_model.layers.26.input_layernorm.weight +model.language_model.layers.26.post_attention_layernorm.weight +model.language_model.layers.27.self_attn.q_proj.weight +model.language_model.layers.27.self_attn.k_proj.weight +model.language_model.layers.27.self_attn.v_proj.weight +model.language_model.layers.27.self_attn.o_proj.weight +model.language_model.layers.27.self_attn.q_norm.weight +model.language_model.layers.27.self_attn.k_norm.weight +model.language_model.layers.27.mlp.gate.weight +model.language_model.layers.27.mlp.experts.gate_up_proj +model.language_model.layers.27.mlp.experts.down_proj +model.language_model.layers.27.input_layernorm.weight +model.language_model.layers.27.post_attention_layernorm.weight +model.language_model.layers.28.self_attn.q_proj.weight +model.language_model.layers.28.self_attn.k_proj.weight +model.language_model.layers.28.self_attn.v_proj.weight +model.language_model.layers.28.self_attn.o_proj.weight +model.language_model.layers.28.self_attn.q_norm.weight +model.language_model.layers.28.self_attn.k_norm.weight +model.language_model.layers.28.mlp.gate.weight +model.language_model.layers.28.mlp.experts.gate_up_proj +model.language_model.layers.28.mlp.experts.down_proj +model.language_model.layers.28.input_layernorm.weight +model.language_model.layers.28.post_attention_layernorm.weight +model.language_model.layers.29.self_attn.q_proj.weight +model.language_model.layers.29.self_attn.k_proj.weight +model.language_model.layers.29.self_attn.v_proj.weight +model.language_model.layers.29.self_attn.o_proj.weight +model.language_model.layers.29.self_attn.q_norm.weight +model.language_model.layers.29.self_attn.k_norm.weight +model.language_model.layers.29.mlp.gate.weight +model.language_model.layers.29.mlp.experts.gate_up_proj +model.language_model.layers.29.mlp.experts.down_proj +model.language_model.layers.29.input_layernorm.weight +model.language_model.layers.29.post_attention_layernorm.weight +model.language_model.layers.30.self_attn.q_proj.weight +model.language_model.layers.30.self_attn.k_proj.weight +model.language_model.layers.30.self_attn.v_proj.weight +model.language_model.layers.30.self_attn.o_proj.weight +model.language_model.layers.30.self_attn.q_norm.weight +model.language_model.layers.30.self_attn.k_norm.weight +model.language_model.layers.30.mlp.gate.weight +model.language_model.layers.30.mlp.experts.gate_up_proj +model.language_model.layers.30.mlp.experts.down_proj +model.language_model.layers.30.input_layernorm.weight +model.language_model.layers.30.post_attention_layernorm.weight +model.language_model.layers.31.self_attn.q_proj.weight 
+model.language_model.layers.31.self_attn.k_proj.weight +model.language_model.layers.31.self_attn.v_proj.weight +model.language_model.layers.31.self_attn.o_proj.weight +model.language_model.layers.31.self_attn.q_norm.weight +model.language_model.layers.31.self_attn.k_norm.weight +model.language_model.layers.31.mlp.gate.weight +model.language_model.layers.31.mlp.experts.gate_up_proj +model.language_model.layers.31.mlp.experts.down_proj +model.language_model.layers.31.input_layernorm.weight +model.language_model.layers.31.post_attention_layernorm.weight +model.language_model.layers.32.self_attn.q_proj.weight +model.language_model.layers.32.self_attn.k_proj.weight +model.language_model.layers.32.self_attn.v_proj.weight +model.language_model.layers.32.self_attn.o_proj.weight +model.language_model.layers.32.self_attn.q_norm.weight +model.language_model.layers.32.self_attn.k_norm.weight +model.language_model.layers.32.mlp.gate.weight +model.language_model.layers.32.mlp.experts.gate_up_proj +model.language_model.layers.32.mlp.experts.down_proj +model.language_model.layers.32.input_layernorm.weight +model.language_model.layers.32.post_attention_layernorm.weight +model.language_model.layers.33.self_attn.q_proj.weight +model.language_model.layers.33.self_attn.k_proj.weight +model.language_model.layers.33.self_attn.v_proj.weight +model.language_model.layers.33.self_attn.o_proj.weight +model.language_model.layers.33.self_attn.q_norm.weight +model.language_model.layers.33.self_attn.k_norm.weight +model.language_model.layers.33.mlp.gate.weight +model.language_model.layers.33.mlp.experts.gate_up_proj +model.language_model.layers.33.mlp.experts.down_proj +model.language_model.layers.33.input_layernorm.weight +model.language_model.layers.33.post_attention_layernorm.weight +model.language_model.layers.34.self_attn.q_proj.weight +model.language_model.layers.34.self_attn.k_proj.weight +model.language_model.layers.34.self_attn.v_proj.weight +model.language_model.layers.34.self_attn.o_proj.weight +model.language_model.layers.34.self_attn.q_norm.weight +model.language_model.layers.34.self_attn.k_norm.weight +model.language_model.layers.34.mlp.gate.weight +model.language_model.layers.34.mlp.experts.gate_up_proj +model.language_model.layers.34.mlp.experts.down_proj +model.language_model.layers.34.input_layernorm.weight +model.language_model.layers.34.post_attention_layernorm.weight +model.language_model.layers.35.self_attn.q_proj.weight +model.language_model.layers.35.self_attn.k_proj.weight +model.language_model.layers.35.self_attn.v_proj.weight +model.language_model.layers.35.self_attn.o_proj.weight +model.language_model.layers.35.self_attn.q_norm.weight +model.language_model.layers.35.self_attn.k_norm.weight +model.language_model.layers.35.mlp.gate.weight +model.language_model.layers.35.mlp.experts.gate_up_proj +model.language_model.layers.35.mlp.experts.down_proj +model.language_model.layers.35.input_layernorm.weight +model.language_model.layers.35.post_attention_layernorm.weight +model.language_model.layers.36.self_attn.q_proj.weight +model.language_model.layers.36.self_attn.k_proj.weight +model.language_model.layers.36.self_attn.v_proj.weight +model.language_model.layers.36.self_attn.o_proj.weight +model.language_model.layers.36.self_attn.q_norm.weight +model.language_model.layers.36.self_attn.k_norm.weight +model.language_model.layers.36.mlp.gate.weight +model.language_model.layers.36.mlp.experts.gate_up_proj +model.language_model.layers.36.mlp.experts.down_proj 
+model.language_model.layers.36.input_layernorm.weight +model.language_model.layers.36.post_attention_layernorm.weight +model.language_model.layers.37.self_attn.q_proj.weight +model.language_model.layers.37.self_attn.k_proj.weight +model.language_model.layers.37.self_attn.v_proj.weight +model.language_model.layers.37.self_attn.o_proj.weight +model.language_model.layers.37.self_attn.q_norm.weight +model.language_model.layers.37.self_attn.k_norm.weight +model.language_model.layers.37.mlp.gate.weight +model.language_model.layers.37.mlp.experts.gate_up_proj +model.language_model.layers.37.mlp.experts.down_proj +model.language_model.layers.37.input_layernorm.weight +model.language_model.layers.37.post_attention_layernorm.weight +model.language_model.layers.38.self_attn.q_proj.weight +model.language_model.layers.38.self_attn.k_proj.weight +model.language_model.layers.38.self_attn.v_proj.weight +model.language_model.layers.38.self_attn.o_proj.weight +model.language_model.layers.38.self_attn.q_norm.weight +model.language_model.layers.38.self_attn.k_norm.weight +model.language_model.layers.38.mlp.gate.weight +model.language_model.layers.38.mlp.experts.gate_up_proj +model.language_model.layers.38.mlp.experts.down_proj +model.language_model.layers.38.input_layernorm.weight +model.language_model.layers.38.post_attention_layernorm.weight +model.language_model.layers.39.self_attn.q_proj.weight +model.language_model.layers.39.self_attn.k_proj.weight +model.language_model.layers.39.self_attn.v_proj.weight +model.language_model.layers.39.self_attn.o_proj.weight +model.language_model.layers.39.self_attn.q_norm.weight +model.language_model.layers.39.self_attn.k_norm.weight +model.language_model.layers.39.mlp.gate.weight +model.language_model.layers.39.mlp.experts.gate_up_proj +model.language_model.layers.39.mlp.experts.down_proj +model.language_model.layers.39.input_layernorm.weight +model.language_model.layers.39.post_attention_layernorm.weight +model.language_model.layers.40.self_attn.q_proj.weight +model.language_model.layers.40.self_attn.k_proj.weight +model.language_model.layers.40.self_attn.v_proj.weight +model.language_model.layers.40.self_attn.o_proj.weight +model.language_model.layers.40.self_attn.q_norm.weight +model.language_model.layers.40.self_attn.k_norm.weight +model.language_model.layers.40.mlp.gate.weight +model.language_model.layers.40.mlp.experts.gate_up_proj +model.language_model.layers.40.mlp.experts.down_proj +model.language_model.layers.40.input_layernorm.weight +model.language_model.layers.40.post_attention_layernorm.weight +model.language_model.layers.41.self_attn.q_proj.weight +model.language_model.layers.41.self_attn.k_proj.weight +model.language_model.layers.41.self_attn.v_proj.weight +model.language_model.layers.41.self_attn.o_proj.weight +model.language_model.layers.41.self_attn.q_norm.weight +model.language_model.layers.41.self_attn.k_norm.weight +model.language_model.layers.41.mlp.gate.weight +model.language_model.layers.41.mlp.experts.gate_up_proj +model.language_model.layers.41.mlp.experts.down_proj +model.language_model.layers.41.input_layernorm.weight +model.language_model.layers.41.post_attention_layernorm.weight +model.language_model.layers.42.self_attn.q_proj.weight +model.language_model.layers.42.self_attn.k_proj.weight +model.language_model.layers.42.self_attn.v_proj.weight +model.language_model.layers.42.self_attn.o_proj.weight +model.language_model.layers.42.self_attn.q_norm.weight +model.language_model.layers.42.self_attn.k_norm.weight 
+model.language_model.layers.42.mlp.gate.weight +model.language_model.layers.42.mlp.experts.gate_up_proj +model.language_model.layers.42.mlp.experts.down_proj +model.language_model.layers.42.input_layernorm.weight +model.language_model.layers.42.post_attention_layernorm.weight +model.language_model.layers.43.self_attn.q_proj.weight +model.language_model.layers.43.self_attn.k_proj.weight +model.language_model.layers.43.self_attn.v_proj.weight +model.language_model.layers.43.self_attn.o_proj.weight +model.language_model.layers.43.self_attn.q_norm.weight +model.language_model.layers.43.self_attn.k_norm.weight +model.language_model.layers.43.mlp.gate.weight +model.language_model.layers.43.mlp.experts.gate_up_proj +model.language_model.layers.43.mlp.experts.down_proj +model.language_model.layers.43.input_layernorm.weight +model.language_model.layers.43.post_attention_layernorm.weight +model.language_model.layers.44.self_attn.q_proj.weight +model.language_model.layers.44.self_attn.k_proj.weight +model.language_model.layers.44.self_attn.v_proj.weight +model.language_model.layers.44.self_attn.o_proj.weight +model.language_model.layers.44.self_attn.q_norm.weight +model.language_model.layers.44.self_attn.k_norm.weight +model.language_model.layers.44.mlp.gate.weight +model.language_model.layers.44.mlp.experts.gate_up_proj +model.language_model.layers.44.mlp.experts.down_proj +model.language_model.layers.44.input_layernorm.weight +model.language_model.layers.44.post_attention_layernorm.weight +model.language_model.layers.45.self_attn.q_proj.weight +model.language_model.layers.45.self_attn.k_proj.weight +model.language_model.layers.45.self_attn.v_proj.weight +model.language_model.layers.45.self_attn.o_proj.weight +model.language_model.layers.45.self_attn.q_norm.weight +model.language_model.layers.45.self_attn.k_norm.weight +model.language_model.layers.45.mlp.gate.weight +model.language_model.layers.45.mlp.experts.gate_up_proj +model.language_model.layers.45.mlp.experts.down_proj +model.language_model.layers.45.input_layernorm.weight +model.language_model.layers.45.post_attention_layernorm.weight +model.language_model.layers.46.self_attn.q_proj.weight +model.language_model.layers.46.self_attn.k_proj.weight +model.language_model.layers.46.self_attn.v_proj.weight +model.language_model.layers.46.self_attn.o_proj.weight +model.language_model.layers.46.self_attn.q_norm.weight +model.language_model.layers.46.self_attn.k_norm.weight +model.language_model.layers.46.mlp.gate.weight +model.language_model.layers.46.mlp.experts.gate_up_proj +model.language_model.layers.46.mlp.experts.down_proj +model.language_model.layers.46.input_layernorm.weight +model.language_model.layers.46.post_attention_layernorm.weight +model.language_model.layers.47.self_attn.q_proj.weight +model.language_model.layers.47.self_attn.k_proj.weight +model.language_model.layers.47.self_attn.v_proj.weight +model.language_model.layers.47.self_attn.o_proj.weight +model.language_model.layers.47.self_attn.q_norm.weight +model.language_model.layers.47.self_attn.k_norm.weight +model.language_model.layers.47.mlp.gate.weight +model.language_model.layers.47.mlp.experts.gate_up_proj +model.language_model.layers.47.mlp.experts.down_proj +model.language_model.layers.47.input_layernorm.weight +model.language_model.layers.47.post_attention_layernorm.weight +model.language_model.norm.weight +lm_head.weight From 00c7566b93d54d1db818bcf5b002b172fc295a04 Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Tue, 14 Oct 2025 10:44:29 +0800 Subject: [PATCH 04/17] 
update --- model_keys.txt | 0 qwen3vl_model_keys.txt | 882 ----------------------------------------- 2 files changed, 882 deletions(-) delete mode 100644 model_keys.txt delete mode 100644 qwen3vl_model_keys.txt diff --git a/model_keys.txt b/model_keys.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/qwen3vl_model_keys.txt b/qwen3vl_model_keys.txt deleted file mode 100644 index f100c5b8..00000000 --- a/qwen3vl_model_keys.txt +++ /dev/null @@ -1,882 +0,0 @@ -model.visual.patch_embed.proj.weight -model.visual.patch_embed.proj.bias -model.visual.pos_embed.weight -model.visual.blocks.0.norm1.weight -model.visual.blocks.0.norm1.bias -model.visual.blocks.0.norm2.weight -model.visual.blocks.0.norm2.bias -model.visual.blocks.0.attn.qkv.weight -model.visual.blocks.0.attn.qkv.bias -model.visual.blocks.0.attn.proj.weight -model.visual.blocks.0.attn.proj.bias -model.visual.blocks.0.mlp.linear_fc1.weight -model.visual.blocks.0.mlp.linear_fc1.bias -model.visual.blocks.0.mlp.linear_fc2.weight -model.visual.blocks.0.mlp.linear_fc2.bias -model.visual.blocks.1.norm1.weight -model.visual.blocks.1.norm1.bias -model.visual.blocks.1.norm2.weight -model.visual.blocks.1.norm2.bias -model.visual.blocks.1.attn.qkv.weight -model.visual.blocks.1.attn.qkv.bias -model.visual.blocks.1.attn.proj.weight -model.visual.blocks.1.attn.proj.bias -model.visual.blocks.1.mlp.linear_fc1.weight -model.visual.blocks.1.mlp.linear_fc1.bias -model.visual.blocks.1.mlp.linear_fc2.weight -model.visual.blocks.1.mlp.linear_fc2.bias -model.visual.blocks.2.norm1.weight -model.visual.blocks.2.norm1.bias -model.visual.blocks.2.norm2.weight -model.visual.blocks.2.norm2.bias -model.visual.blocks.2.attn.qkv.weight -model.visual.blocks.2.attn.qkv.bias -model.visual.blocks.2.attn.proj.weight -model.visual.blocks.2.attn.proj.bias -model.visual.blocks.2.mlp.linear_fc1.weight -model.visual.blocks.2.mlp.linear_fc1.bias -model.visual.blocks.2.mlp.linear_fc2.weight -model.visual.blocks.2.mlp.linear_fc2.bias -model.visual.blocks.3.norm1.weight -model.visual.blocks.3.norm1.bias -model.visual.blocks.3.norm2.weight -model.visual.blocks.3.norm2.bias -model.visual.blocks.3.attn.qkv.weight -model.visual.blocks.3.attn.qkv.bias -model.visual.blocks.3.attn.proj.weight -model.visual.blocks.3.attn.proj.bias -model.visual.blocks.3.mlp.linear_fc1.weight -model.visual.blocks.3.mlp.linear_fc1.bias -model.visual.blocks.3.mlp.linear_fc2.weight -model.visual.blocks.3.mlp.linear_fc2.bias -model.visual.blocks.4.norm1.weight -model.visual.blocks.4.norm1.bias -model.visual.blocks.4.norm2.weight -model.visual.blocks.4.norm2.bias -model.visual.blocks.4.attn.qkv.weight -model.visual.blocks.4.attn.qkv.bias -model.visual.blocks.4.attn.proj.weight -model.visual.blocks.4.attn.proj.bias -model.visual.blocks.4.mlp.linear_fc1.weight -model.visual.blocks.4.mlp.linear_fc1.bias -model.visual.blocks.4.mlp.linear_fc2.weight -model.visual.blocks.4.mlp.linear_fc2.bias -model.visual.blocks.5.norm1.weight -model.visual.blocks.5.norm1.bias -model.visual.blocks.5.norm2.weight -model.visual.blocks.5.norm2.bias -model.visual.blocks.5.attn.qkv.weight -model.visual.blocks.5.attn.qkv.bias -model.visual.blocks.5.attn.proj.weight -model.visual.blocks.5.attn.proj.bias -model.visual.blocks.5.mlp.linear_fc1.weight -model.visual.blocks.5.mlp.linear_fc1.bias -model.visual.blocks.5.mlp.linear_fc2.weight -model.visual.blocks.5.mlp.linear_fc2.bias -model.visual.blocks.6.norm1.weight -model.visual.blocks.6.norm1.bias -model.visual.blocks.6.norm2.weight -model.visual.blocks.6.norm2.bias 
-model.visual.blocks.6.attn.qkv.weight -model.visual.blocks.6.attn.qkv.bias -model.visual.blocks.6.attn.proj.weight -model.visual.blocks.6.attn.proj.bias -model.visual.blocks.6.mlp.linear_fc1.weight -model.visual.blocks.6.mlp.linear_fc1.bias -model.visual.blocks.6.mlp.linear_fc2.weight -model.visual.blocks.6.mlp.linear_fc2.bias -model.visual.blocks.7.norm1.weight -model.visual.blocks.7.norm1.bias -model.visual.blocks.7.norm2.weight -model.visual.blocks.7.norm2.bias -model.visual.blocks.7.attn.qkv.weight -model.visual.blocks.7.attn.qkv.bias -model.visual.blocks.7.attn.proj.weight -model.visual.blocks.7.attn.proj.bias -model.visual.blocks.7.mlp.linear_fc1.weight -model.visual.blocks.7.mlp.linear_fc1.bias -model.visual.blocks.7.mlp.linear_fc2.weight -model.visual.blocks.7.mlp.linear_fc2.bias -model.visual.blocks.8.norm1.weight -model.visual.blocks.8.norm1.bias -model.visual.blocks.8.norm2.weight -model.visual.blocks.8.norm2.bias -model.visual.blocks.8.attn.qkv.weight -model.visual.blocks.8.attn.qkv.bias -model.visual.blocks.8.attn.proj.weight -model.visual.blocks.8.attn.proj.bias -model.visual.blocks.8.mlp.linear_fc1.weight -model.visual.blocks.8.mlp.linear_fc1.bias -model.visual.blocks.8.mlp.linear_fc2.weight -model.visual.blocks.8.mlp.linear_fc2.bias -model.visual.blocks.9.norm1.weight -model.visual.blocks.9.norm1.bias -model.visual.blocks.9.norm2.weight -model.visual.blocks.9.norm2.bias -model.visual.blocks.9.attn.qkv.weight -model.visual.blocks.9.attn.qkv.bias -model.visual.blocks.9.attn.proj.weight -model.visual.blocks.9.attn.proj.bias -model.visual.blocks.9.mlp.linear_fc1.weight -model.visual.blocks.9.mlp.linear_fc1.bias -model.visual.blocks.9.mlp.linear_fc2.weight -model.visual.blocks.9.mlp.linear_fc2.bias -model.visual.blocks.10.norm1.weight -model.visual.blocks.10.norm1.bias -model.visual.blocks.10.norm2.weight -model.visual.blocks.10.norm2.bias -model.visual.blocks.10.attn.qkv.weight -model.visual.blocks.10.attn.qkv.bias -model.visual.blocks.10.attn.proj.weight -model.visual.blocks.10.attn.proj.bias -model.visual.blocks.10.mlp.linear_fc1.weight -model.visual.blocks.10.mlp.linear_fc1.bias -model.visual.blocks.10.mlp.linear_fc2.weight -model.visual.blocks.10.mlp.linear_fc2.bias -model.visual.blocks.11.norm1.weight -model.visual.blocks.11.norm1.bias -model.visual.blocks.11.norm2.weight -model.visual.blocks.11.norm2.bias -model.visual.blocks.11.attn.qkv.weight -model.visual.blocks.11.attn.qkv.bias -model.visual.blocks.11.attn.proj.weight -model.visual.blocks.11.attn.proj.bias -model.visual.blocks.11.mlp.linear_fc1.weight -model.visual.blocks.11.mlp.linear_fc1.bias -model.visual.blocks.11.mlp.linear_fc2.weight -model.visual.blocks.11.mlp.linear_fc2.bias -model.visual.blocks.12.norm1.weight -model.visual.blocks.12.norm1.bias -model.visual.blocks.12.norm2.weight -model.visual.blocks.12.norm2.bias -model.visual.blocks.12.attn.qkv.weight -model.visual.blocks.12.attn.qkv.bias -model.visual.blocks.12.attn.proj.weight -model.visual.blocks.12.attn.proj.bias -model.visual.blocks.12.mlp.linear_fc1.weight -model.visual.blocks.12.mlp.linear_fc1.bias -model.visual.blocks.12.mlp.linear_fc2.weight -model.visual.blocks.12.mlp.linear_fc2.bias -model.visual.blocks.13.norm1.weight -model.visual.blocks.13.norm1.bias -model.visual.blocks.13.norm2.weight -model.visual.blocks.13.norm2.bias -model.visual.blocks.13.attn.qkv.weight -model.visual.blocks.13.attn.qkv.bias -model.visual.blocks.13.attn.proj.weight -model.visual.blocks.13.attn.proj.bias -model.visual.blocks.13.mlp.linear_fc1.weight 
-model.visual.blocks.13.mlp.linear_fc1.bias -model.visual.blocks.13.mlp.linear_fc2.weight -model.visual.blocks.13.mlp.linear_fc2.bias -model.visual.blocks.14.norm1.weight -model.visual.blocks.14.norm1.bias -model.visual.blocks.14.norm2.weight -model.visual.blocks.14.norm2.bias -model.visual.blocks.14.attn.qkv.weight -model.visual.blocks.14.attn.qkv.bias -model.visual.blocks.14.attn.proj.weight -model.visual.blocks.14.attn.proj.bias -model.visual.blocks.14.mlp.linear_fc1.weight -model.visual.blocks.14.mlp.linear_fc1.bias -model.visual.blocks.14.mlp.linear_fc2.weight -model.visual.blocks.14.mlp.linear_fc2.bias -model.visual.blocks.15.norm1.weight -model.visual.blocks.15.norm1.bias -model.visual.blocks.15.norm2.weight -model.visual.blocks.15.norm2.bias -model.visual.blocks.15.attn.qkv.weight -model.visual.blocks.15.attn.qkv.bias -model.visual.blocks.15.attn.proj.weight -model.visual.blocks.15.attn.proj.bias -model.visual.blocks.15.mlp.linear_fc1.weight -model.visual.blocks.15.mlp.linear_fc1.bias -model.visual.blocks.15.mlp.linear_fc2.weight -model.visual.blocks.15.mlp.linear_fc2.bias -model.visual.blocks.16.norm1.weight -model.visual.blocks.16.norm1.bias -model.visual.blocks.16.norm2.weight -model.visual.blocks.16.norm2.bias -model.visual.blocks.16.attn.qkv.weight -model.visual.blocks.16.attn.qkv.bias -model.visual.blocks.16.attn.proj.weight -model.visual.blocks.16.attn.proj.bias -model.visual.blocks.16.mlp.linear_fc1.weight -model.visual.blocks.16.mlp.linear_fc1.bias -model.visual.blocks.16.mlp.linear_fc2.weight -model.visual.blocks.16.mlp.linear_fc2.bias -model.visual.blocks.17.norm1.weight -model.visual.blocks.17.norm1.bias -model.visual.blocks.17.norm2.weight -model.visual.blocks.17.norm2.bias -model.visual.blocks.17.attn.qkv.weight -model.visual.blocks.17.attn.qkv.bias -model.visual.blocks.17.attn.proj.weight -model.visual.blocks.17.attn.proj.bias -model.visual.blocks.17.mlp.linear_fc1.weight -model.visual.blocks.17.mlp.linear_fc1.bias -model.visual.blocks.17.mlp.linear_fc2.weight -model.visual.blocks.17.mlp.linear_fc2.bias -model.visual.blocks.18.norm1.weight -model.visual.blocks.18.norm1.bias -model.visual.blocks.18.norm2.weight -model.visual.blocks.18.norm2.bias -model.visual.blocks.18.attn.qkv.weight -model.visual.blocks.18.attn.qkv.bias -model.visual.blocks.18.attn.proj.weight -model.visual.blocks.18.attn.proj.bias -model.visual.blocks.18.mlp.linear_fc1.weight -model.visual.blocks.18.mlp.linear_fc1.bias -model.visual.blocks.18.mlp.linear_fc2.weight -model.visual.blocks.18.mlp.linear_fc2.bias -model.visual.blocks.19.norm1.weight -model.visual.blocks.19.norm1.bias -model.visual.blocks.19.norm2.weight -model.visual.blocks.19.norm2.bias -model.visual.blocks.19.attn.qkv.weight -model.visual.blocks.19.attn.qkv.bias -model.visual.blocks.19.attn.proj.weight -model.visual.blocks.19.attn.proj.bias -model.visual.blocks.19.mlp.linear_fc1.weight -model.visual.blocks.19.mlp.linear_fc1.bias -model.visual.blocks.19.mlp.linear_fc2.weight -model.visual.blocks.19.mlp.linear_fc2.bias -model.visual.blocks.20.norm1.weight -model.visual.blocks.20.norm1.bias -model.visual.blocks.20.norm2.weight -model.visual.blocks.20.norm2.bias -model.visual.blocks.20.attn.qkv.weight -model.visual.blocks.20.attn.qkv.bias -model.visual.blocks.20.attn.proj.weight -model.visual.blocks.20.attn.proj.bias -model.visual.blocks.20.mlp.linear_fc1.weight -model.visual.blocks.20.mlp.linear_fc1.bias -model.visual.blocks.20.mlp.linear_fc2.weight -model.visual.blocks.20.mlp.linear_fc2.bias -model.visual.blocks.21.norm1.weight 
-model.visual.blocks.21.norm1.bias -model.visual.blocks.21.norm2.weight -model.visual.blocks.21.norm2.bias -model.visual.blocks.21.attn.qkv.weight -model.visual.blocks.21.attn.qkv.bias -model.visual.blocks.21.attn.proj.weight -model.visual.blocks.21.attn.proj.bias -model.visual.blocks.21.mlp.linear_fc1.weight -model.visual.blocks.21.mlp.linear_fc1.bias -model.visual.blocks.21.mlp.linear_fc2.weight -model.visual.blocks.21.mlp.linear_fc2.bias -model.visual.blocks.22.norm1.weight -model.visual.blocks.22.norm1.bias -model.visual.blocks.22.norm2.weight -model.visual.blocks.22.norm2.bias -model.visual.blocks.22.attn.qkv.weight -model.visual.blocks.22.attn.qkv.bias -model.visual.blocks.22.attn.proj.weight -model.visual.blocks.22.attn.proj.bias -model.visual.blocks.22.mlp.linear_fc1.weight -model.visual.blocks.22.mlp.linear_fc1.bias -model.visual.blocks.22.mlp.linear_fc2.weight -model.visual.blocks.22.mlp.linear_fc2.bias -model.visual.blocks.23.norm1.weight -model.visual.blocks.23.norm1.bias -model.visual.blocks.23.norm2.weight -model.visual.blocks.23.norm2.bias -model.visual.blocks.23.attn.qkv.weight -model.visual.blocks.23.attn.qkv.bias -model.visual.blocks.23.attn.proj.weight -model.visual.blocks.23.attn.proj.bias -model.visual.blocks.23.mlp.linear_fc1.weight -model.visual.blocks.23.mlp.linear_fc1.bias -model.visual.blocks.23.mlp.linear_fc2.weight -model.visual.blocks.23.mlp.linear_fc2.bias -model.visual.blocks.24.norm1.weight -model.visual.blocks.24.norm1.bias -model.visual.blocks.24.norm2.weight -model.visual.blocks.24.norm2.bias -model.visual.blocks.24.attn.qkv.weight -model.visual.blocks.24.attn.qkv.bias -model.visual.blocks.24.attn.proj.weight -model.visual.blocks.24.attn.proj.bias -model.visual.blocks.24.mlp.linear_fc1.weight -model.visual.blocks.24.mlp.linear_fc1.bias -model.visual.blocks.24.mlp.linear_fc2.weight -model.visual.blocks.24.mlp.linear_fc2.bias -model.visual.blocks.25.norm1.weight -model.visual.blocks.25.norm1.bias -model.visual.blocks.25.norm2.weight -model.visual.blocks.25.norm2.bias -model.visual.blocks.25.attn.qkv.weight -model.visual.blocks.25.attn.qkv.bias -model.visual.blocks.25.attn.proj.weight -model.visual.blocks.25.attn.proj.bias -model.visual.blocks.25.mlp.linear_fc1.weight -model.visual.blocks.25.mlp.linear_fc1.bias -model.visual.blocks.25.mlp.linear_fc2.weight -model.visual.blocks.25.mlp.linear_fc2.bias -model.visual.blocks.26.norm1.weight -model.visual.blocks.26.norm1.bias -model.visual.blocks.26.norm2.weight -model.visual.blocks.26.norm2.bias -model.visual.blocks.26.attn.qkv.weight -model.visual.blocks.26.attn.qkv.bias -model.visual.blocks.26.attn.proj.weight -model.visual.blocks.26.attn.proj.bias -model.visual.blocks.26.mlp.linear_fc1.weight -model.visual.blocks.26.mlp.linear_fc1.bias -model.visual.blocks.26.mlp.linear_fc2.weight -model.visual.blocks.26.mlp.linear_fc2.bias -model.visual.merger.norm.weight -model.visual.merger.norm.bias -model.visual.merger.linear_fc1.weight -model.visual.merger.linear_fc1.bias -model.visual.merger.linear_fc2.weight -model.visual.merger.linear_fc2.bias -model.visual.deepstack_merger_list.0.norm.weight -model.visual.deepstack_merger_list.0.norm.bias -model.visual.deepstack_merger_list.0.linear_fc1.weight -model.visual.deepstack_merger_list.0.linear_fc1.bias -model.visual.deepstack_merger_list.0.linear_fc2.weight -model.visual.deepstack_merger_list.0.linear_fc2.bias -model.visual.deepstack_merger_list.1.norm.weight -model.visual.deepstack_merger_list.1.norm.bias -model.visual.deepstack_merger_list.1.linear_fc1.weight 
-model.visual.deepstack_merger_list.1.linear_fc1.bias -model.visual.deepstack_merger_list.1.linear_fc2.weight -model.visual.deepstack_merger_list.1.linear_fc2.bias -model.visual.deepstack_merger_list.2.norm.weight -model.visual.deepstack_merger_list.2.norm.bias -model.visual.deepstack_merger_list.2.linear_fc1.weight -model.visual.deepstack_merger_list.2.linear_fc1.bias -model.visual.deepstack_merger_list.2.linear_fc2.weight -model.visual.deepstack_merger_list.2.linear_fc2.bias -model.language_model.embed_tokens.weight -model.language_model.layers.0.self_attn.q_proj.weight -model.language_model.layers.0.self_attn.k_proj.weight -model.language_model.layers.0.self_attn.v_proj.weight -model.language_model.layers.0.self_attn.o_proj.weight -model.language_model.layers.0.self_attn.q_norm.weight -model.language_model.layers.0.self_attn.k_norm.weight -model.language_model.layers.0.mlp.gate.weight -model.language_model.layers.0.mlp.experts.gate_up_proj -model.language_model.layers.0.mlp.experts.down_proj -model.language_model.layers.0.input_layernorm.weight -model.language_model.layers.0.post_attention_layernorm.weight -model.language_model.layers.1.self_attn.q_proj.weight -model.language_model.layers.1.self_attn.k_proj.weight -model.language_model.layers.1.self_attn.v_proj.weight -model.language_model.layers.1.self_attn.o_proj.weight -model.language_model.layers.1.self_attn.q_norm.weight -model.language_model.layers.1.self_attn.k_norm.weight -model.language_model.layers.1.mlp.gate.weight -model.language_model.layers.1.mlp.experts.gate_up_proj -model.language_model.layers.1.mlp.experts.down_proj -model.language_model.layers.1.input_layernorm.weight -model.language_model.layers.1.post_attention_layernorm.weight -model.language_model.layers.2.self_attn.q_proj.weight -model.language_model.layers.2.self_attn.k_proj.weight -model.language_model.layers.2.self_attn.v_proj.weight -model.language_model.layers.2.self_attn.o_proj.weight -model.language_model.layers.2.self_attn.q_norm.weight -model.language_model.layers.2.self_attn.k_norm.weight -model.language_model.layers.2.mlp.gate.weight -model.language_model.layers.2.mlp.experts.gate_up_proj -model.language_model.layers.2.mlp.experts.down_proj -model.language_model.layers.2.input_layernorm.weight -model.language_model.layers.2.post_attention_layernorm.weight -model.language_model.layers.3.self_attn.q_proj.weight -model.language_model.layers.3.self_attn.k_proj.weight -model.language_model.layers.3.self_attn.v_proj.weight -model.language_model.layers.3.self_attn.o_proj.weight -model.language_model.layers.3.self_attn.q_norm.weight -model.language_model.layers.3.self_attn.k_norm.weight -model.language_model.layers.3.mlp.gate.weight -model.language_model.layers.3.mlp.experts.gate_up_proj -model.language_model.layers.3.mlp.experts.down_proj -model.language_model.layers.3.input_layernorm.weight -model.language_model.layers.3.post_attention_layernorm.weight -model.language_model.layers.4.self_attn.q_proj.weight -model.language_model.layers.4.self_attn.k_proj.weight -model.language_model.layers.4.self_attn.v_proj.weight -model.language_model.layers.4.self_attn.o_proj.weight -model.language_model.layers.4.self_attn.q_norm.weight -model.language_model.layers.4.self_attn.k_norm.weight -model.language_model.layers.4.mlp.gate.weight -model.language_model.layers.4.mlp.experts.gate_up_proj -model.language_model.layers.4.mlp.experts.down_proj -model.language_model.layers.4.input_layernorm.weight -model.language_model.layers.4.post_attention_layernorm.weight 
-model.language_model.layers.5.self_attn.q_proj.weight -model.language_model.layers.5.self_attn.k_proj.weight -model.language_model.layers.5.self_attn.v_proj.weight -model.language_model.layers.5.self_attn.o_proj.weight -model.language_model.layers.5.self_attn.q_norm.weight -model.language_model.layers.5.self_attn.k_norm.weight -model.language_model.layers.5.mlp.gate.weight -model.language_model.layers.5.mlp.experts.gate_up_proj -model.language_model.layers.5.mlp.experts.down_proj -model.language_model.layers.5.input_layernorm.weight -model.language_model.layers.5.post_attention_layernorm.weight -model.language_model.layers.6.self_attn.q_proj.weight -model.language_model.layers.6.self_attn.k_proj.weight -model.language_model.layers.6.self_attn.v_proj.weight -model.language_model.layers.6.self_attn.o_proj.weight -model.language_model.layers.6.self_attn.q_norm.weight -model.language_model.layers.6.self_attn.k_norm.weight -model.language_model.layers.6.mlp.gate.weight -model.language_model.layers.6.mlp.experts.gate_up_proj -model.language_model.layers.6.mlp.experts.down_proj -model.language_model.layers.6.input_layernorm.weight -model.language_model.layers.6.post_attention_layernorm.weight -model.language_model.layers.7.self_attn.q_proj.weight -model.language_model.layers.7.self_attn.k_proj.weight -model.language_model.layers.7.self_attn.v_proj.weight -model.language_model.layers.7.self_attn.o_proj.weight -model.language_model.layers.7.self_attn.q_norm.weight -model.language_model.layers.7.self_attn.k_norm.weight -model.language_model.layers.7.mlp.gate.weight -model.language_model.layers.7.mlp.experts.gate_up_proj -model.language_model.layers.7.mlp.experts.down_proj -model.language_model.layers.7.input_layernorm.weight -model.language_model.layers.7.post_attention_layernorm.weight -model.language_model.layers.8.self_attn.q_proj.weight -model.language_model.layers.8.self_attn.k_proj.weight -model.language_model.layers.8.self_attn.v_proj.weight -model.language_model.layers.8.self_attn.o_proj.weight -model.language_model.layers.8.self_attn.q_norm.weight -model.language_model.layers.8.self_attn.k_norm.weight -model.language_model.layers.8.mlp.gate.weight -model.language_model.layers.8.mlp.experts.gate_up_proj -model.language_model.layers.8.mlp.experts.down_proj -model.language_model.layers.8.input_layernorm.weight -model.language_model.layers.8.post_attention_layernorm.weight -model.language_model.layers.9.self_attn.q_proj.weight -model.language_model.layers.9.self_attn.k_proj.weight -model.language_model.layers.9.self_attn.v_proj.weight -model.language_model.layers.9.self_attn.o_proj.weight -model.language_model.layers.9.self_attn.q_norm.weight -model.language_model.layers.9.self_attn.k_norm.weight -model.language_model.layers.9.mlp.gate.weight -model.language_model.layers.9.mlp.experts.gate_up_proj -model.language_model.layers.9.mlp.experts.down_proj -model.language_model.layers.9.input_layernorm.weight -model.language_model.layers.9.post_attention_layernorm.weight -model.language_model.layers.10.self_attn.q_proj.weight -model.language_model.layers.10.self_attn.k_proj.weight -model.language_model.layers.10.self_attn.v_proj.weight -model.language_model.layers.10.self_attn.o_proj.weight -model.language_model.layers.10.self_attn.q_norm.weight -model.language_model.layers.10.self_attn.k_norm.weight -model.language_model.layers.10.mlp.gate.weight -model.language_model.layers.10.mlp.experts.gate_up_proj -model.language_model.layers.10.mlp.experts.down_proj 
-model.language_model.layers.10.input_layernorm.weight -model.language_model.layers.10.post_attention_layernorm.weight -model.language_model.layers.11.self_attn.q_proj.weight -model.language_model.layers.11.self_attn.k_proj.weight -model.language_model.layers.11.self_attn.v_proj.weight -model.language_model.layers.11.self_attn.o_proj.weight -model.language_model.layers.11.self_attn.q_norm.weight -model.language_model.layers.11.self_attn.k_norm.weight -model.language_model.layers.11.mlp.gate.weight -model.language_model.layers.11.mlp.experts.gate_up_proj -model.language_model.layers.11.mlp.experts.down_proj -model.language_model.layers.11.input_layernorm.weight -model.language_model.layers.11.post_attention_layernorm.weight -model.language_model.layers.12.self_attn.q_proj.weight -model.language_model.layers.12.self_attn.k_proj.weight -model.language_model.layers.12.self_attn.v_proj.weight -model.language_model.layers.12.self_attn.o_proj.weight -model.language_model.layers.12.self_attn.q_norm.weight -model.language_model.layers.12.self_attn.k_norm.weight -model.language_model.layers.12.mlp.gate.weight -model.language_model.layers.12.mlp.experts.gate_up_proj -model.language_model.layers.12.mlp.experts.down_proj -model.language_model.layers.12.input_layernorm.weight -model.language_model.layers.12.post_attention_layernorm.weight -model.language_model.layers.13.self_attn.q_proj.weight -model.language_model.layers.13.self_attn.k_proj.weight -model.language_model.layers.13.self_attn.v_proj.weight -model.language_model.layers.13.self_attn.o_proj.weight -model.language_model.layers.13.self_attn.q_norm.weight -model.language_model.layers.13.self_attn.k_norm.weight -model.language_model.layers.13.mlp.gate.weight -model.language_model.layers.13.mlp.experts.gate_up_proj -model.language_model.layers.13.mlp.experts.down_proj -model.language_model.layers.13.input_layernorm.weight -model.language_model.layers.13.post_attention_layernorm.weight -model.language_model.layers.14.self_attn.q_proj.weight -model.language_model.layers.14.self_attn.k_proj.weight -model.language_model.layers.14.self_attn.v_proj.weight -model.language_model.layers.14.self_attn.o_proj.weight -model.language_model.layers.14.self_attn.q_norm.weight -model.language_model.layers.14.self_attn.k_norm.weight -model.language_model.layers.14.mlp.gate.weight -model.language_model.layers.14.mlp.experts.gate_up_proj -model.language_model.layers.14.mlp.experts.down_proj -model.language_model.layers.14.input_layernorm.weight -model.language_model.layers.14.post_attention_layernorm.weight -model.language_model.layers.15.self_attn.q_proj.weight -model.language_model.layers.15.self_attn.k_proj.weight -model.language_model.layers.15.self_attn.v_proj.weight -model.language_model.layers.15.self_attn.o_proj.weight -model.language_model.layers.15.self_attn.q_norm.weight -model.language_model.layers.15.self_attn.k_norm.weight -model.language_model.layers.15.mlp.gate.weight -model.language_model.layers.15.mlp.experts.gate_up_proj -model.language_model.layers.15.mlp.experts.down_proj -model.language_model.layers.15.input_layernorm.weight -model.language_model.layers.15.post_attention_layernorm.weight -model.language_model.layers.16.self_attn.q_proj.weight -model.language_model.layers.16.self_attn.k_proj.weight -model.language_model.layers.16.self_attn.v_proj.weight -model.language_model.layers.16.self_attn.o_proj.weight -model.language_model.layers.16.self_attn.q_norm.weight -model.language_model.layers.16.self_attn.k_norm.weight 
-model.language_model.layers.16.mlp.gate.weight -model.language_model.layers.16.mlp.experts.gate_up_proj -model.language_model.layers.16.mlp.experts.down_proj -model.language_model.layers.16.input_layernorm.weight -model.language_model.layers.16.post_attention_layernorm.weight -model.language_model.layers.17.self_attn.q_proj.weight -model.language_model.layers.17.self_attn.k_proj.weight -model.language_model.layers.17.self_attn.v_proj.weight -model.language_model.layers.17.self_attn.o_proj.weight -model.language_model.layers.17.self_attn.q_norm.weight -model.language_model.layers.17.self_attn.k_norm.weight -model.language_model.layers.17.mlp.gate.weight -model.language_model.layers.17.mlp.experts.gate_up_proj -model.language_model.layers.17.mlp.experts.down_proj -model.language_model.layers.17.input_layernorm.weight -model.language_model.layers.17.post_attention_layernorm.weight -model.language_model.layers.18.self_attn.q_proj.weight -model.language_model.layers.18.self_attn.k_proj.weight -model.language_model.layers.18.self_attn.v_proj.weight -model.language_model.layers.18.self_attn.o_proj.weight -model.language_model.layers.18.self_attn.q_norm.weight -model.language_model.layers.18.self_attn.k_norm.weight -model.language_model.layers.18.mlp.gate.weight -model.language_model.layers.18.mlp.experts.gate_up_proj -model.language_model.layers.18.mlp.experts.down_proj -model.language_model.layers.18.input_layernorm.weight -model.language_model.layers.18.post_attention_layernorm.weight -model.language_model.layers.19.self_attn.q_proj.weight -model.language_model.layers.19.self_attn.k_proj.weight -model.language_model.layers.19.self_attn.v_proj.weight -model.language_model.layers.19.self_attn.o_proj.weight -model.language_model.layers.19.self_attn.q_norm.weight -model.language_model.layers.19.self_attn.k_norm.weight -model.language_model.layers.19.mlp.gate.weight -model.language_model.layers.19.mlp.experts.gate_up_proj -model.language_model.layers.19.mlp.experts.down_proj -model.language_model.layers.19.input_layernorm.weight -model.language_model.layers.19.post_attention_layernorm.weight -model.language_model.layers.20.self_attn.q_proj.weight -model.language_model.layers.20.self_attn.k_proj.weight -model.language_model.layers.20.self_attn.v_proj.weight -model.language_model.layers.20.self_attn.o_proj.weight -model.language_model.layers.20.self_attn.q_norm.weight -model.language_model.layers.20.self_attn.k_norm.weight -model.language_model.layers.20.mlp.gate.weight -model.language_model.layers.20.mlp.experts.gate_up_proj -model.language_model.layers.20.mlp.experts.down_proj -model.language_model.layers.20.input_layernorm.weight -model.language_model.layers.20.post_attention_layernorm.weight -model.language_model.layers.21.self_attn.q_proj.weight -model.language_model.layers.21.self_attn.k_proj.weight -model.language_model.layers.21.self_attn.v_proj.weight -model.language_model.layers.21.self_attn.o_proj.weight -model.language_model.layers.21.self_attn.q_norm.weight -model.language_model.layers.21.self_attn.k_norm.weight -model.language_model.layers.21.mlp.gate.weight -model.language_model.layers.21.mlp.experts.gate_up_proj -model.language_model.layers.21.mlp.experts.down_proj -model.language_model.layers.21.input_layernorm.weight -model.language_model.layers.21.post_attention_layernorm.weight -model.language_model.layers.22.self_attn.q_proj.weight -model.language_model.layers.22.self_attn.k_proj.weight -model.language_model.layers.22.self_attn.v_proj.weight 
-model.language_model.layers.22.self_attn.o_proj.weight -model.language_model.layers.22.self_attn.q_norm.weight -model.language_model.layers.22.self_attn.k_norm.weight -model.language_model.layers.22.mlp.gate.weight -model.language_model.layers.22.mlp.experts.gate_up_proj -model.language_model.layers.22.mlp.experts.down_proj -model.language_model.layers.22.input_layernorm.weight -model.language_model.layers.22.post_attention_layernorm.weight -model.language_model.layers.23.self_attn.q_proj.weight -model.language_model.layers.23.self_attn.k_proj.weight -model.language_model.layers.23.self_attn.v_proj.weight -model.language_model.layers.23.self_attn.o_proj.weight -model.language_model.layers.23.self_attn.q_norm.weight -model.language_model.layers.23.self_attn.k_norm.weight -model.language_model.layers.23.mlp.gate.weight -model.language_model.layers.23.mlp.experts.gate_up_proj -model.language_model.layers.23.mlp.experts.down_proj -model.language_model.layers.23.input_layernorm.weight -model.language_model.layers.23.post_attention_layernorm.weight -model.language_model.layers.24.self_attn.q_proj.weight -model.language_model.layers.24.self_attn.k_proj.weight -model.language_model.layers.24.self_attn.v_proj.weight -model.language_model.layers.24.self_attn.o_proj.weight -model.language_model.layers.24.self_attn.q_norm.weight -model.language_model.layers.24.self_attn.k_norm.weight -model.language_model.layers.24.mlp.gate.weight -model.language_model.layers.24.mlp.experts.gate_up_proj -model.language_model.layers.24.mlp.experts.down_proj -model.language_model.layers.24.input_layernorm.weight -model.language_model.layers.24.post_attention_layernorm.weight -model.language_model.layers.25.self_attn.q_proj.weight -model.language_model.layers.25.self_attn.k_proj.weight -model.language_model.layers.25.self_attn.v_proj.weight -model.language_model.layers.25.self_attn.o_proj.weight -model.language_model.layers.25.self_attn.q_norm.weight -model.language_model.layers.25.self_attn.k_norm.weight -model.language_model.layers.25.mlp.gate.weight -model.language_model.layers.25.mlp.experts.gate_up_proj -model.language_model.layers.25.mlp.experts.down_proj -model.language_model.layers.25.input_layernorm.weight -model.language_model.layers.25.post_attention_layernorm.weight -model.language_model.layers.26.self_attn.q_proj.weight -model.language_model.layers.26.self_attn.k_proj.weight -model.language_model.layers.26.self_attn.v_proj.weight -model.language_model.layers.26.self_attn.o_proj.weight -model.language_model.layers.26.self_attn.q_norm.weight -model.language_model.layers.26.self_attn.k_norm.weight -model.language_model.layers.26.mlp.gate.weight -model.language_model.layers.26.mlp.experts.gate_up_proj -model.language_model.layers.26.mlp.experts.down_proj -model.language_model.layers.26.input_layernorm.weight -model.language_model.layers.26.post_attention_layernorm.weight -model.language_model.layers.27.self_attn.q_proj.weight -model.language_model.layers.27.self_attn.k_proj.weight -model.language_model.layers.27.self_attn.v_proj.weight -model.language_model.layers.27.self_attn.o_proj.weight -model.language_model.layers.27.self_attn.q_norm.weight -model.language_model.layers.27.self_attn.k_norm.weight -model.language_model.layers.27.mlp.gate.weight -model.language_model.layers.27.mlp.experts.gate_up_proj -model.language_model.layers.27.mlp.experts.down_proj -model.language_model.layers.27.input_layernorm.weight -model.language_model.layers.27.post_attention_layernorm.weight 
-model.language_model.layers.28.self_attn.q_proj.weight -model.language_model.layers.28.self_attn.k_proj.weight -model.language_model.layers.28.self_attn.v_proj.weight -model.language_model.layers.28.self_attn.o_proj.weight -model.language_model.layers.28.self_attn.q_norm.weight -model.language_model.layers.28.self_attn.k_norm.weight -model.language_model.layers.28.mlp.gate.weight -model.language_model.layers.28.mlp.experts.gate_up_proj -model.language_model.layers.28.mlp.experts.down_proj -model.language_model.layers.28.input_layernorm.weight -model.language_model.layers.28.post_attention_layernorm.weight -model.language_model.layers.29.self_attn.q_proj.weight -model.language_model.layers.29.self_attn.k_proj.weight -model.language_model.layers.29.self_attn.v_proj.weight -model.language_model.layers.29.self_attn.o_proj.weight -model.language_model.layers.29.self_attn.q_norm.weight -model.language_model.layers.29.self_attn.k_norm.weight -model.language_model.layers.29.mlp.gate.weight -model.language_model.layers.29.mlp.experts.gate_up_proj -model.language_model.layers.29.mlp.experts.down_proj -model.language_model.layers.29.input_layernorm.weight -model.language_model.layers.29.post_attention_layernorm.weight -model.language_model.layers.30.self_attn.q_proj.weight -model.language_model.layers.30.self_attn.k_proj.weight -model.language_model.layers.30.self_attn.v_proj.weight -model.language_model.layers.30.self_attn.o_proj.weight -model.language_model.layers.30.self_attn.q_norm.weight -model.language_model.layers.30.self_attn.k_norm.weight -model.language_model.layers.30.mlp.gate.weight -model.language_model.layers.30.mlp.experts.gate_up_proj -model.language_model.layers.30.mlp.experts.down_proj -model.language_model.layers.30.input_layernorm.weight -model.language_model.layers.30.post_attention_layernorm.weight -model.language_model.layers.31.self_attn.q_proj.weight -model.language_model.layers.31.self_attn.k_proj.weight -model.language_model.layers.31.self_attn.v_proj.weight -model.language_model.layers.31.self_attn.o_proj.weight -model.language_model.layers.31.self_attn.q_norm.weight -model.language_model.layers.31.self_attn.k_norm.weight -model.language_model.layers.31.mlp.gate.weight -model.language_model.layers.31.mlp.experts.gate_up_proj -model.language_model.layers.31.mlp.experts.down_proj -model.language_model.layers.31.input_layernorm.weight -model.language_model.layers.31.post_attention_layernorm.weight -model.language_model.layers.32.self_attn.q_proj.weight -model.language_model.layers.32.self_attn.k_proj.weight -model.language_model.layers.32.self_attn.v_proj.weight -model.language_model.layers.32.self_attn.o_proj.weight -model.language_model.layers.32.self_attn.q_norm.weight -model.language_model.layers.32.self_attn.k_norm.weight -model.language_model.layers.32.mlp.gate.weight -model.language_model.layers.32.mlp.experts.gate_up_proj -model.language_model.layers.32.mlp.experts.down_proj -model.language_model.layers.32.input_layernorm.weight -model.language_model.layers.32.post_attention_layernorm.weight -model.language_model.layers.33.self_attn.q_proj.weight -model.language_model.layers.33.self_attn.k_proj.weight -model.language_model.layers.33.self_attn.v_proj.weight -model.language_model.layers.33.self_attn.o_proj.weight -model.language_model.layers.33.self_attn.q_norm.weight -model.language_model.layers.33.self_attn.k_norm.weight -model.language_model.layers.33.mlp.gate.weight -model.language_model.layers.33.mlp.experts.gate_up_proj 
-model.language_model.layers.33.mlp.experts.down_proj -model.language_model.layers.33.input_layernorm.weight -model.language_model.layers.33.post_attention_layernorm.weight -model.language_model.layers.34.self_attn.q_proj.weight -model.language_model.layers.34.self_attn.k_proj.weight -model.language_model.layers.34.self_attn.v_proj.weight -model.language_model.layers.34.self_attn.o_proj.weight -model.language_model.layers.34.self_attn.q_norm.weight -model.language_model.layers.34.self_attn.k_norm.weight -model.language_model.layers.34.mlp.gate.weight -model.language_model.layers.34.mlp.experts.gate_up_proj -model.language_model.layers.34.mlp.experts.down_proj -model.language_model.layers.34.input_layernorm.weight -model.language_model.layers.34.post_attention_layernorm.weight -model.language_model.layers.35.self_attn.q_proj.weight -model.language_model.layers.35.self_attn.k_proj.weight -model.language_model.layers.35.self_attn.v_proj.weight -model.language_model.layers.35.self_attn.o_proj.weight -model.language_model.layers.35.self_attn.q_norm.weight -model.language_model.layers.35.self_attn.k_norm.weight -model.language_model.layers.35.mlp.gate.weight -model.language_model.layers.35.mlp.experts.gate_up_proj -model.language_model.layers.35.mlp.experts.down_proj -model.language_model.layers.35.input_layernorm.weight -model.language_model.layers.35.post_attention_layernorm.weight -model.language_model.layers.36.self_attn.q_proj.weight -model.language_model.layers.36.self_attn.k_proj.weight -model.language_model.layers.36.self_attn.v_proj.weight -model.language_model.layers.36.self_attn.o_proj.weight -model.language_model.layers.36.self_attn.q_norm.weight -model.language_model.layers.36.self_attn.k_norm.weight -model.language_model.layers.36.mlp.gate.weight -model.language_model.layers.36.mlp.experts.gate_up_proj -model.language_model.layers.36.mlp.experts.down_proj -model.language_model.layers.36.input_layernorm.weight -model.language_model.layers.36.post_attention_layernorm.weight -model.language_model.layers.37.self_attn.q_proj.weight -model.language_model.layers.37.self_attn.k_proj.weight -model.language_model.layers.37.self_attn.v_proj.weight -model.language_model.layers.37.self_attn.o_proj.weight -model.language_model.layers.37.self_attn.q_norm.weight -model.language_model.layers.37.self_attn.k_norm.weight -model.language_model.layers.37.mlp.gate.weight -model.language_model.layers.37.mlp.experts.gate_up_proj -model.language_model.layers.37.mlp.experts.down_proj -model.language_model.layers.37.input_layernorm.weight -model.language_model.layers.37.post_attention_layernorm.weight -model.language_model.layers.38.self_attn.q_proj.weight -model.language_model.layers.38.self_attn.k_proj.weight -model.language_model.layers.38.self_attn.v_proj.weight -model.language_model.layers.38.self_attn.o_proj.weight -model.language_model.layers.38.self_attn.q_norm.weight -model.language_model.layers.38.self_attn.k_norm.weight -model.language_model.layers.38.mlp.gate.weight -model.language_model.layers.38.mlp.experts.gate_up_proj -model.language_model.layers.38.mlp.experts.down_proj -model.language_model.layers.38.input_layernorm.weight -model.language_model.layers.38.post_attention_layernorm.weight -model.language_model.layers.39.self_attn.q_proj.weight -model.language_model.layers.39.self_attn.k_proj.weight -model.language_model.layers.39.self_attn.v_proj.weight -model.language_model.layers.39.self_attn.o_proj.weight -model.language_model.layers.39.self_attn.q_norm.weight 
-model.language_model.layers.39.self_attn.k_norm.weight -model.language_model.layers.39.mlp.gate.weight -model.language_model.layers.39.mlp.experts.gate_up_proj -model.language_model.layers.39.mlp.experts.down_proj -model.language_model.layers.39.input_layernorm.weight -model.language_model.layers.39.post_attention_layernorm.weight -model.language_model.layers.40.self_attn.q_proj.weight -model.language_model.layers.40.self_attn.k_proj.weight -model.language_model.layers.40.self_attn.v_proj.weight -model.language_model.layers.40.self_attn.o_proj.weight -model.language_model.layers.40.self_attn.q_norm.weight -model.language_model.layers.40.self_attn.k_norm.weight -model.language_model.layers.40.mlp.gate.weight -model.language_model.layers.40.mlp.experts.gate_up_proj -model.language_model.layers.40.mlp.experts.down_proj -model.language_model.layers.40.input_layernorm.weight -model.language_model.layers.40.post_attention_layernorm.weight -model.language_model.layers.41.self_attn.q_proj.weight -model.language_model.layers.41.self_attn.k_proj.weight -model.language_model.layers.41.self_attn.v_proj.weight -model.language_model.layers.41.self_attn.o_proj.weight -model.language_model.layers.41.self_attn.q_norm.weight -model.language_model.layers.41.self_attn.k_norm.weight -model.language_model.layers.41.mlp.gate.weight -model.language_model.layers.41.mlp.experts.gate_up_proj -model.language_model.layers.41.mlp.experts.down_proj -model.language_model.layers.41.input_layernorm.weight -model.language_model.layers.41.post_attention_layernorm.weight -model.language_model.layers.42.self_attn.q_proj.weight -model.language_model.layers.42.self_attn.k_proj.weight -model.language_model.layers.42.self_attn.v_proj.weight -model.language_model.layers.42.self_attn.o_proj.weight -model.language_model.layers.42.self_attn.q_norm.weight -model.language_model.layers.42.self_attn.k_norm.weight -model.language_model.layers.42.mlp.gate.weight -model.language_model.layers.42.mlp.experts.gate_up_proj -model.language_model.layers.42.mlp.experts.down_proj -model.language_model.layers.42.input_layernorm.weight -model.language_model.layers.42.post_attention_layernorm.weight -model.language_model.layers.43.self_attn.q_proj.weight -model.language_model.layers.43.self_attn.k_proj.weight -model.language_model.layers.43.self_attn.v_proj.weight -model.language_model.layers.43.self_attn.o_proj.weight -model.language_model.layers.43.self_attn.q_norm.weight -model.language_model.layers.43.self_attn.k_norm.weight -model.language_model.layers.43.mlp.gate.weight -model.language_model.layers.43.mlp.experts.gate_up_proj -model.language_model.layers.43.mlp.experts.down_proj -model.language_model.layers.43.input_layernorm.weight -model.language_model.layers.43.post_attention_layernorm.weight -model.language_model.layers.44.self_attn.q_proj.weight -model.language_model.layers.44.self_attn.k_proj.weight -model.language_model.layers.44.self_attn.v_proj.weight -model.language_model.layers.44.self_attn.o_proj.weight -model.language_model.layers.44.self_attn.q_norm.weight -model.language_model.layers.44.self_attn.k_norm.weight -model.language_model.layers.44.mlp.gate.weight -model.language_model.layers.44.mlp.experts.gate_up_proj -model.language_model.layers.44.mlp.experts.down_proj -model.language_model.layers.44.input_layernorm.weight -model.language_model.layers.44.post_attention_layernorm.weight -model.language_model.layers.45.self_attn.q_proj.weight -model.language_model.layers.45.self_attn.k_proj.weight 
-model.language_model.layers.45.self_attn.v_proj.weight -model.language_model.layers.45.self_attn.o_proj.weight -model.language_model.layers.45.self_attn.q_norm.weight -model.language_model.layers.45.self_attn.k_norm.weight -model.language_model.layers.45.mlp.gate.weight -model.language_model.layers.45.mlp.experts.gate_up_proj -model.language_model.layers.45.mlp.experts.down_proj -model.language_model.layers.45.input_layernorm.weight -model.language_model.layers.45.post_attention_layernorm.weight -model.language_model.layers.46.self_attn.q_proj.weight -model.language_model.layers.46.self_attn.k_proj.weight -model.language_model.layers.46.self_attn.v_proj.weight -model.language_model.layers.46.self_attn.o_proj.weight -model.language_model.layers.46.self_attn.q_norm.weight -model.language_model.layers.46.self_attn.k_norm.weight -model.language_model.layers.46.mlp.gate.weight -model.language_model.layers.46.mlp.experts.gate_up_proj -model.language_model.layers.46.mlp.experts.down_proj -model.language_model.layers.46.input_layernorm.weight -model.language_model.layers.46.post_attention_layernorm.weight -model.language_model.layers.47.self_attn.q_proj.weight -model.language_model.layers.47.self_attn.k_proj.weight -model.language_model.layers.47.self_attn.v_proj.weight -model.language_model.layers.47.self_attn.o_proj.weight -model.language_model.layers.47.self_attn.q_norm.weight -model.language_model.layers.47.self_attn.k_norm.weight -model.language_model.layers.47.mlp.gate.weight -model.language_model.layers.47.mlp.experts.gate_up_proj -model.language_model.layers.47.mlp.experts.down_proj -model.language_model.layers.47.input_layernorm.weight -model.language_model.layers.47.post_attention_layernorm.weight -model.language_model.norm.weight -lm_head.weight From a2a886141b547cf437969cbee19d1b70816c8f32 Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Wed, 15 Oct 2025 10:47:30 +0800 Subject: [PATCH 05/17] debug --- chatlearn/algorithm/grpo_utils/loss_gallery.py | 8 ++++---- chatlearn/algorithm/grpo_utils/policy_trainer.py | 5 +++-- chatlearn/models/fsdp_module.py | 8 ++++---- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/chatlearn/algorithm/grpo_utils/loss_gallery.py b/chatlearn/algorithm/grpo_utils/loss_gallery.py index 5b7f3165..b93b5d7b 100644 --- a/chatlearn/algorithm/grpo_utils/loss_gallery.py +++ b/chatlearn/algorithm/grpo_utils/loss_gallery.py @@ -16,8 +16,8 @@ def calculate_grpo_loss( # clip logprobs_diff before exp to avoid overflow logprobs_diff = torch.clamp(logprobs_diff, max=diff_clip_ratio) ratio = torch.exp(logprobs_diff) - # advantages = torch.tensor(advantages).to(logprobs_diff.device) - advantages = torch.tensor(advantages, dtype=logprobs_diff.dtype, device=logprobs_diff.device) + advantages = torch.tensor(advantages).to(logprobs_diff.device) + # advantages = torch.tensor(advantages, dtype=logprobs_diff.dtype, device=logprobs_diff.device) pg_loss = -advantages.unsqueeze(-1) * ratio # Upper and lower bound clip pg_loss_2 = -advantages.unsqueeze(-1) * torch.clamp( @@ -51,8 +51,8 @@ def calculate_gspo_loss( logprobs_diff = torch.clamp(seq_logprobs_diff, max=diff_clip_ratio) ratio = torch.exp(logprobs_diff) - # advantages = torch.tensor(advantages).to(logprobs_diff.device) - advantages = torch.tensor(advantages, dtype=logprobs_diff.dtype, device=logprobs_diff.device) + advantages = torch.tensor(advantages).to(logprobs_diff.device) + # advantages = torch.tensor(advantages, dtype=logprobs_diff.dtype, device=logprobs_diff.device) advantages.unsqueeze_(-1) pg_loss = 
-advantages * ratio diff --git a/chatlearn/algorithm/grpo_utils/policy_trainer.py b/chatlearn/algorithm/grpo_utils/policy_trainer.py index 36fb87e4..1326ccb9 100644 --- a/chatlearn/algorithm/grpo_utils/policy_trainer.py +++ b/chatlearn/algorithm/grpo_utils/policy_trainer.py @@ -270,8 +270,7 @@ def train_step(self, data_list: List[Dict[str, Any]], **kwargs): # pylint: disab total_loss = total_loss - self.module_args.entropy_coef * entropy_loss_mean if self.module_args.kl_coef > 0: total_loss = total_loss + self.module_args.kl_coef * kl_loss_mean - # breakpoint() - # total_loss = total_loss.bfloat16() + total_loss.backward() pg_loss_list.append(pg_loss.detach()) @@ -311,6 +310,8 @@ def forward_step(self, data: List[Dict[str, Any]], **kwargs) -> List[Dict[str, A inputs[k] = to_device(torch.cuda.current_device(), v) with torch.no_grad(): if self.runtime_args.model_type == 'vlm': + # a = self.model.visual.state_dict()['blocks.4.attn.proj.bias'].full_tensor() + output = self.model( input_ids=inputs['all_tokens'], pixel_values=inputs['pixel_values'], diff --git a/chatlearn/models/fsdp_module.py b/chatlearn/models/fsdp_module.py index ef301f92..14fe131f 100644 --- a/chatlearn/models/fsdp_module.py +++ b/chatlearn/models/fsdp_module.py @@ -344,7 +344,7 @@ def model_setup(self): model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False}) # fsdp2 warp - mix_precision_config = MixedPrecisionPolicy(param_dtype=torch.float32, reduce_dtype=torch.float32, cast_forward_inputs=True) + mix_precision_config = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32, cast_forward_inputs=True) fsdp_kwargs = { "mesh": self.device_mesh, "mp_policy": mix_precision_config, @@ -355,6 +355,7 @@ def model_setup(self): if isinstance(fsdp_transformer_layer_cls_to_wrap, str): fsdp_transformer_layer_cls_to_wrap = [fsdp_transformer_layer_cls_to_wrap] modules = [] + for module in model.modules(): if module.__class__.__name__ in fsdp_transformer_layer_cls_to_wrap or \ (isinstance(module, nn.Embedding) and not model.config.tie_word_embeddings): @@ -363,7 +364,7 @@ def model_setup(self): for module in modules: fully_shard(module, **fsdp_kwargs) fully_shard(model, **fsdp_kwargs) - + if self.module_args.meta_init: shard_dict = self.get_dtensor(model, args.load) model.load_state_dict(shard_dict, assign=True) @@ -371,8 +372,7 @@ def model_setup(self): self.model = model self.model.to(torch.float32) - # breakpoint() - + if not self.trainable: self.optimizer = None self.model.eval() From 5184537e0017f8617c60a16888cd25196685e4ef Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Wed, 15 Oct 2025 13:31:47 +0800 Subject: [PATCH 06/17] update qwen3vl --- .../algorithm/grpo_utils/loss_gallery.py | 2 -- .../algorithm/grpo_utils/policy_trainer.py | 3 -- chatlearn/models/fsdp_module.py | 22 ++++++------- chatlearn/models/patches/monkey_patch.py | 14 ++++++-- .../patches/transformers/qwen3_vl_patch.py | 33 +++++++++++++++++++ chatlearn/models/sglang_module.py | 8 +++-- 6 files changed, 62 insertions(+), 20 deletions(-) create mode 100644 chatlearn/models/patches/transformers/qwen3_vl_patch.py diff --git a/chatlearn/algorithm/grpo_utils/loss_gallery.py b/chatlearn/algorithm/grpo_utils/loss_gallery.py index b93b5d7b..0463cb6c 100644 --- a/chatlearn/algorithm/grpo_utils/loss_gallery.py +++ b/chatlearn/algorithm/grpo_utils/loss_gallery.py @@ -17,7 +17,6 @@ def calculate_grpo_loss( logprobs_diff = torch.clamp(logprobs_diff, max=diff_clip_ratio) ratio = torch.exp(logprobs_diff) advantages = 
torch.tensor(advantages).to(logprobs_diff.device) - # advantages = torch.tensor(advantages, dtype=logprobs_diff.dtype, device=logprobs_diff.device) pg_loss = -advantages.unsqueeze(-1) * ratio # Upper and lower bound clip pg_loss_2 = -advantages.unsqueeze(-1) * torch.clamp( @@ -52,7 +51,6 @@ def calculate_gspo_loss( ratio = torch.exp(logprobs_diff) advantages = torch.tensor(advantages).to(logprobs_diff.device) - # advantages = torch.tensor(advantages, dtype=logprobs_diff.dtype, device=logprobs_diff.device) advantages.unsqueeze_(-1) pg_loss = -advantages * ratio diff --git a/chatlearn/algorithm/grpo_utils/policy_trainer.py b/chatlearn/algorithm/grpo_utils/policy_trainer.py index 1326ccb9..8a69b45f 100644 --- a/chatlearn/algorithm/grpo_utils/policy_trainer.py +++ b/chatlearn/algorithm/grpo_utils/policy_trainer.py @@ -270,7 +270,6 @@ def train_step(self, data_list: List[Dict[str, Any]], **kwargs): # pylint: disab total_loss = total_loss - self.module_args.entropy_coef * entropy_loss_mean if self.module_args.kl_coef > 0: total_loss = total_loss + self.module_args.kl_coef * kl_loss_mean - total_loss.backward() pg_loss_list.append(pg_loss.detach()) @@ -310,8 +309,6 @@ def forward_step(self, data: List[Dict[str, Any]], **kwargs) -> List[Dict[str, A inputs[k] = to_device(torch.cuda.current_device(), v) with torch.no_grad(): if self.runtime_args.model_type == 'vlm': - # a = self.model.visual.state_dict()['blocks.4.attn.proj.bias'].full_tensor() - output = self.model( input_ids=inputs['all_tokens'], pixel_values=inputs['pixel_values'], diff --git a/chatlearn/models/fsdp_module.py b/chatlearn/models/fsdp_module.py index 14fe131f..a9f8961d 100644 --- a/chatlearn/models/fsdp_module.py +++ b/chatlearn/models/fsdp_module.py @@ -23,7 +23,6 @@ import numpy as np from packaging.version import Version as PkgVersion -import transformers import torch from torch import Tensor import torch.distributed as dist @@ -258,10 +257,9 @@ def create_model(self, model_path: str , torch_dtype: torch.dtype, meta_init: bo attn_implementation="flash_attention_2", trust_remote_code=self.module_args.trust_remote_code ) - if PkgVersion(transformers.__version__)==PkgVersion('4.51.3'): - # vl patch needed for transformers 4.51.3 - from chatlearn.models.patches.monkey_patch import apply_qwenvl - apply_qwenvl(model) + + from chatlearn.models.patches.monkey_patch import apply_qwenvl + apply_qwenvl(model) assert self.sp_size == 1, "VL model only support sp_size=1" else: @@ -355,7 +353,6 @@ def model_setup(self): if isinstance(fsdp_transformer_layer_cls_to_wrap, str): fsdp_transformer_layer_cls_to_wrap = [fsdp_transformer_layer_cls_to_wrap] modules = [] - for module in model.modules(): if module.__class__.__name__ in fsdp_transformer_layer_cls_to_wrap or \ (isinstance(module, nn.Embedding) and not model.config.tie_word_embeddings): @@ -364,7 +361,7 @@ def model_setup(self): for module in modules: fully_shard(module, **fsdp_kwargs) fully_shard(model, **fsdp_kwargs) - + if self.module_args.meta_init: shard_dict = self.get_dtensor(model, args.load) model.load_state_dict(shard_dict, assign=True) @@ -372,7 +369,7 @@ def model_setup(self): self.model = model self.model.to(torch.float32) - + if not self.trainable: self.optimizer = None self.model.eval() @@ -508,10 +505,13 @@ def get_weight_ipc_handles_by_name(self, block_name: List[str]): if rollout_engine == "sglang": # lazy import sglang from sglang.srt.utils import MultiprocessingSerializer - # from sglang.srt.patch_torch import monkey_patch_torch_reductions + import sglang + if 
PkgVersion(sglang.__version__)>=PkgVersion('0.5.3'): + from sglang.srt.utils.patch_torch import monkey_patch_torch_reductions + else: + from sglang.srt.patch_torch import monkey_patch_torch_reductions + monkey_patch_torch_reductions() - # monkey_patch_torch_reductions() - flattened_tensor, metadatas = self.convert_block2flattened_bucket( block_parameter ) diff --git a/chatlearn/models/patches/monkey_patch.py b/chatlearn/models/patches/monkey_patch.py index a0be47d1..c7c281bb 100644 --- a/chatlearn/models/patches/monkey_patch.py +++ b/chatlearn/models/patches/monkey_patch.py @@ -13,6 +13,9 @@ # limitations under the License. # ============================================================================== """Apply patches for different model architectures""" +from packaging.version import Version as PkgVersion +import transformers + def apply_sp_monkey_patch(model_config): print(f"applying sequence parallel patches for {model_config.architectures}") if model_config.architectures[0] == "Qwen2ForCausalLM": @@ -42,8 +45,15 @@ def apply_group_gemm(model): def apply_qwenvl(model): print(f"applying qwenvl patches for {model.config.architectures[0]}") if model.config.architectures[0] == "Qwen2_5_VLForConditionalGeneration": - from chatlearn.models.patches.transformers.qwen2_5_vl_patch import apply_qwenvl_patch \ + if PkgVersion(transformers.__version__)==PkgVersion('4.51.3'): + # vl2.5 patch needed for transformers 4.51.3 + from chatlearn.models.patches.transformers.qwen2_5_vl_patch import apply_qwenvl_patch \ + # pylint: disable=import-outside-toplevel + apply_qwenvl_patch() + elif model.config.architectures[0] in ["Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration"]: + assert PkgVersion(transformers.__version__)>=PkgVersion('4.57.0'), "qwen3vl needed transformers >= 4.57.0" + from chatlearn.models.patches.transformers.qwen3_vl_patch import apply_qwen3vl_patch \ # pylint: disable=import-outside-toplevel - apply_qwenvl_patch() + apply_qwen3vl_patch() else: raise ValueError(f"Unsupported model architecture: {model.config.architectures} for qwenvl patch") diff --git a/chatlearn/models/patches/transformers/qwen3_vl_patch.py b/chatlearn/models/patches/transformers/qwen3_vl_patch.py new file mode 100644 index 00000000..c1145ff7 --- /dev/null +++ b/chatlearn/models/patches/transformers/qwen3_vl_patch.py @@ -0,0 +1,33 @@ +"""patches for qwen3 vl model""" +from typing import Optional +import torch + +def Qwen3VLBlock_patched_forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs, + ) -> torch.Tensor: + # ========================================================================= + # add force dype change for qwen3_vl or backward will occur type error + hidden_states = hidden_states.to(self.norm1.weight.dtype) + # ========================================================================= + + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) + return hidden_states + +def apply_qwen3vl_patch(): + # pylint: disable=import-outside-toplevel + from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLVisionBlock + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeVisionBlock + Qwen3VLVisionBlock.forward 
= Qwen3VLBlock_patched_forward
+    Qwen3VLMoeVisionBlock.forward = Qwen3VLBlock_patched_forward
diff --git a/chatlearn/models/sglang_module.py b/chatlearn/models/sglang_module.py
index 04357591..a85c5d01 100644
--- a/chatlearn/models/sglang_module.py
+++ b/chatlearn/models/sglang_module.py
@@ -570,8 +570,12 @@ def parameter_sync(self):
     def update_weights_from_buckets(self, buckets: List[Optional['BucketInfo']]):
         """Used for Mcore2SGLang Parameter Sync
         """
-        # from sglang.srt.patch_torch import monkey_patch_torch_reductions
-        # monkey_patch_torch_reductions()
+        if PkgVersion(sglang.__version__)>=PkgVersion('0.5.3'):
+            from sglang.srt.utils.patch_torch import monkey_patch_torch_reductions
+        else:
+            from sglang.srt.patch_torch import monkey_patch_torch_reductions
+        monkey_patch_torch_reductions()
+
         param_id_to_update = set()
         for bucket in buckets:
             if bucket is None:

From 556c152642d2320fadee38766f451a167c5fb55c Mon Sep 17 00:00:00 2001
From: zouxinyi0625
Date: Wed, 15 Oct 2025 13:55:08 +0800
Subject: [PATCH 07/17] update doc

---
 .../en/tutorial/tutorial_grpo_fsdp_qwen3vl.md | 71 ++++++++++++++++++
 .../zh/tutorial/tutorial_grpo_fsdp_qwen3vl.md | 73 +++++++++++++++++++
 .../train_fsdp_sglang_qwen3_vl_8b_grpo.sh     | 50 +++++++++++++
 3 files changed, 194 insertions(+)
 create mode 100644 docs/en/tutorial/tutorial_grpo_fsdp_qwen3vl.md
 create mode 100644 docs/zh/tutorial/tutorial_grpo_fsdp_qwen3vl.md
 create mode 100644 scripts/fsdp_sglang/train_fsdp_sglang_qwen3_vl_8b_grpo.sh

diff --git a/docs/en/tutorial/tutorial_grpo_fsdp_qwen3vl.md b/docs/en/tutorial/tutorial_grpo_fsdp_qwen3vl.md
new file mode 100644
index 00000000..18c5680d
--- /dev/null
+++ b/docs/en/tutorial/tutorial_grpo_fsdp_qwen3vl.md
@@ -0,0 +1,71 @@
+# Qwen3-VL End-to-End GRPO Training Tutorial with FSDP
+
+This document provides instructions for end-to-end GRPO training of the Qwen3-VL-8B model with ChatLearn, using PyTorch FSDP for training and SGLang for rollout.
+
+## Environment Setup
+1. Docker Image Preparation
+
+We recommend running the following example in PAI [DSW](https://help.aliyun.com/zh/pai/user-guide/create-and-manage-dsw-instances/)/[DLC](https://help.aliyun.com/zh/pai/user-guide/create-a-training-task). You need to use the following image to launch the instance.
+```bash
+dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.3-ubuntu24.04-cuda12.6-py312
+```
+
+You can use a VPC address to accelerate image pulling. The image address should be adjusted based on the current region. For example, if you need to launch a DSW instance in Shanghai, you can use the following image `dsw-registry-vpc.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.3-ubuntu24.04-cuda12.6-py312`.
+
+2. Code Preparation
+
+```bash
+git clone https://github.com/alibaba/ChatLearn.git && cd ChatLearn
+```
+
+## Data Preparation
+We take [geo3k](https://hf-mirror.com/datasets/hiyouga/geometry3k) as an example.
+```bash
+# download dataset
+mkdir -p dataset
+export HF_ENDPOINT=https://hf-mirror.com
+
+# data process
+python chatlearn/data/data_preprocess/geo3k.py
+```
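+
+Optionally, you can sanity-check the processed data before training. The sketch below is not part of the original scripts; it only assumes that `geo3k.py` writes `dataset/geo3k/train.parquet`, which is the path `train_fsdp_sglang_qwen3_vl_8b_grpo.sh` later passes as `runtime_args.data_path`.
+```python
+# Hypothetical quick check of the preprocessed geo3k parquet file.
+# Only the file path is taken from this tutorial; the column layout is whatever
+# chatlearn/data/data_preprocess/geo3k.py actually produces.
+import pandas as pd
+
+df = pd.read_parquet("dataset/geo3k/train.parquet")
+print(df.shape)             # number of samples and fields
+print(df.columns.tolist())  # field names emitted by the preprocessing script
+print(df.iloc[0])           # inspect one full sample
+```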
+
+## Training
+You can run the following command to start training:
+
+### Qwen3VL-8B
+Run this command on a server with 8 GPUs.
+MoE models are also supported.
+```bash
+# download model weight
+modelscope download --model Qwen/Qwen3-VL-8B-Instruct --local_dir pretrained_models/Qwen3-VL-8B-Instruct
+
+bash scripts/fsdp_sglang/train_fsdp_sglang_qwen3_vl_8b_grpo.sh
+```
+
+## Using Wandb
+If you want to use Wandb to log the training process, first set your API key:
+```bash
+export WANDB_API_KEY="Your-Wandb-api-key"
+```
+Then change the configuration to:
+```bash
+runtime_args.log_args_dict.enable_wandb=True
+runtime_args.log_args_dict.wandb_project="Your-Wandb-Project-Name"
+```
+
+## Model Conversion
+Saving FSDP models is time-consuming. ChatLearn provides an offline model conversion feature, which converts FSDP-sharded checkpoints back to HuggingFace format. The script is as follows:
+```bash
+export CHATLEARN=$(pwd)
+python chatlearn/offline_ckpt_converter.py \
+    --hf_dir ${CHATLEARN}/Qwen3-VL-8B-Instruct/ \
+    --ckpt_dir ${CHATLEARN}/output/qwen3vl-grpo-8b/save_model/policy_trainer \
+    --save_dir ${CHATLEARN}/output/qwen3vl-grpo-8b/save_model/huggingface/ \
+    --iter 200 \
+    --groupgemm 0
+```
+If you are training an MoE model with groupgemm, please make sure to set:
+```bash
+    --groupgemm 1
+```
+This script will convert the final FSDP sharded model after training back into a HuggingFace model and save it in the path "${CHATLEARN}/output/qwen3vl-grpo-8b/save_model/huggingface/".
\ No newline at end of file
diff --git a/docs/zh/tutorial/tutorial_grpo_fsdp_qwen3vl.md b/docs/zh/tutorial/tutorial_grpo_fsdp_qwen3vl.md
new file mode 100644
index 00000000..f3acc2ee
--- /dev/null
+++ b/docs/zh/tutorial/tutorial_grpo_fsdp_qwen3vl.md
@@ -0,0 +1,73 @@
+# 基于 FSDP 的端到端 Qwen3-VL GRPO训练流程
+
+本文档提供使用 ChatLearn、PyTorch FSDP 和 SGLang 框架来对Qwen3-VL模型进行GRPO训练的快速开始指南。
+
+## 环境配置
+1. Docker镜像准备
+我们建议在PAI [DSW](https://help.aliyun.com/zh/pai/user-guide/create-and-manage-dsw-instances/)/[DLC](https://help.aliyun.com/zh/pai/user-guide/create-a-training-task)中运行该示例，你需要填写如下镜像地址来启动实例：
+```bash
+dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.3-ubuntu24.04-cuda12.6-py312
+```
+
+可以使用vpc地址来加速镜像拉取速度，需要根据当前region信息来更改镜像地址。比如，启动在上海的DSW实例，可以使用如下镜像`dsw-registry-vpc.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.3-ubuntu24.04-cuda12.6-py312`。
+
+2. 代码准备
+
+```bash
+git clone https://github.com/alibaba/ChatLearn.git && cd ChatLearn
+```
+
+## 数据准备
+
+以[geo3k](https://hf-mirror.com/datasets/hiyouga/geometry3k)数据集作为示例.
+```bash +# 下载数据集 +mkdir -p dataset + +export HF_ENDPOINT=https://hf-mirror.com + +# 数据集预处理 +python chatlearn/data/data_preprocess/geo3k.py +``` + +## 训练 +运行以下命令开始训练: + +### Qwen3VL-8B +8卡机器运行如下命令 +MOE 模型也同样支持 +```bash +# 下载模型权重 +modelscope download --model Qwen/Qwen3-VL-8B-Instruct --local_dir pretrained_models/Qwen3-VL-8B-Instruct + +bash scripts/fsdp_sglang/train_fsdp_sglang_qwen3_vl_8b_grpo.sh +``` + +## 使用 Wandb 监控 +如需使用 Wandb 记录训练过程,请修改对应脚本中的配置: + +```bash +export WANDB_API_KEY="Your-Wandb-api-key" +``` +将配置项改为: +```bash +runtime_args.log_args_dict.enable_wandb=True +runtime_args.log_args_dict.wandb_project="Your-Wandb-Project-Name" +``` + +## 模型转化 +FSDP模型保存耗时较高,Chatlearn提供了离线模型转化功能,将FSDP保存的切片模型转化回huggingface模型。脚本如下: +```bash +export CHATLEARN=$(pwd) +python chatlearn/offline_ckpt_converter.py \ + --hf_dir ${CHATLEARN}/Qwen3-VL-8B-Instruct/ \ + --ckpt_dir ${CHATLEARN}/output/qwen3vl-grpo-8b/save_model/policy_trainer \ + --save_dir ${CHATLEARN}/output/qwen3vl-grpo-8b/save_model/huggingface/ \ + --iter 200 \ + --groupgemm 0 +``` +如果你使用groupgemm优化的moe模型训练,请确保设置: +```bash + --groupgemm 1 +``` +这段脚本会将训练完成后的最后一个FSDP切片模型转化回HF模型,并保存在"${CHATLEARN}/output/qwen3vl-grpo-8b/save_model/huggingface/"路径下 diff --git a/scripts/fsdp_sglang/train_fsdp_sglang_qwen3_vl_8b_grpo.sh b/scripts/fsdp_sglang/train_fsdp_sglang_qwen3_vl_8b_grpo.sh new file mode 100644 index 00000000..3bdc63cb --- /dev/null +++ b/scripts/fsdp_sglang/train_fsdp_sglang_qwen3_vl_8b_grpo.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# Tested on 8xH20-3e with 140G VRAM +set -x + +export CHATLEARN=$(pwd) +export PYTHONPATH=${CHATLEARN}:${PYTHONPATH} +source scripts/base_env.sh +export RAY_DEDUP_LOGS=1 +export exp_name=qwen3-vl-grpo-8b-sglang + +python chatlearn/entrypoint.py grpo \ + --config-file template/grpo_fsdp.yaml \ + runtime_args.exp_name=${exp_name} \ + runtime_args.rollout_backend=sglang \ + runtime_args.model_type=vlm \ + runtime_args.data_path=${CHATLEARN}/dataset/geo3k/train.parquet \ + runtime_args.eval_data_path=${CHATLEARN}/dataset/geo3k/test.parquet \ + runtime_args.output_dir=${CHATLEARN}/output/${exp_name} \ + runtime_args.num_episode=200 \ + runtime_args.sample_per_episode=512 \ + runtime_args.train_global_batch_size=512 \ + runtime_args.train_micro_batch_size=8 \ + runtime_args.save_episode_interval=5 \ + runtime_args.eval_episode_interval=5 \ + runtime_args.enable_eval_before_training=False \ + runtime_args.log_args_dict.enable_wandb=False \ + runtime_args.log_args_dict.wandb_project=your_wandb_project \ + models.policy_trainer.num_gpu=${num_device} \ + models.policy_trainer.packing=True \ + models.policy_trainer.meta_init=False \ + models.policy_trainer.groupgemm=False \ + models.policy_trainer.generation_batch_size=64 \ + models.policy_trainer.ulysses_sequence_parallel_size=1 \ + models.policy_trainer.load=${CHATLEARN}/pretrained_models/Qwen3-VL-8B-Instruct/ \ + models.policy_trainer.optimizer.lr=1e-6 \ + models.policy_trainer.pos_clip_ratio=0.2 \ + models.policy_trainer.neg_clip_ratio=0.2 \ + models.policy_trainer.kl_coef=0.01 \ + models.ref_policy.generation_batch_size=64 \ + models.policy.generation_batch_size=64 \ + models.policy.enforce_eager=False \ + models.policy.tensor_model_parallel_size=1 \ + models.policy.max_prompt_tokens_length=1024 \ + models.policy.max_response_tokens_length=2048 \ + models.policy.num_inference_per_prompt=4 \ + models.policy.gpu_memory_utilization=0.85 \ + models.policy.enable_thinking=False \ + models.reward.generation_batch_size=256 \ + 2>&1 | tee log_${exp_name}.log ; exit 
${PIPESTATUS[0]} From 0f56fff74772eaeb38665f255b631048cc065dcc Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Wed, 15 Oct 2025 14:10:32 +0800 Subject: [PATCH 08/17] update --- chatlearn/models/sglang_module.py | 8 ++++++-- docs/en/index.rst | 1 + docs/zh/index.rst | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/chatlearn/models/sglang_module.py b/chatlearn/models/sglang_module.py index a85c5d01..1ded1602 100644 --- a/chatlearn/models/sglang_module.py +++ b/chatlearn/models/sglang_module.py @@ -770,8 +770,12 @@ async def update_weights_from_ipc_handles(self, reduce_data): @torch.no_grad() async def update_weights_from_buckets(self, buckets: List[Optional['BucketInfo']]): - # from sglang.srt.patch_torch import monkey_patch_torch_reductions - # monkey_patch_torch_reductions() + if PkgVersion(sglang.__version__)>=PkgVersion('0.5.3'): + from sglang.srt.utils.patch_torch import monkey_patch_torch_reductions + else: + from sglang.srt.patch_torch import monkey_patch_torch_reductions + monkey_patch_torch_reductions() + param_id_to_update = set() for bucket in buckets: if bucket is None: diff --git a/docs/en/index.rst b/docs/en/index.rst index ed4dc7bc..c9c2b71a 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -24,6 +24,7 @@ ChatLearn Documentation tutorial/tutorial_grpo_fsdp_qwenvl tutorial/tutorial_grpo_mcore_qwenvl tutorial/tutorial_grpo_fsdp_sglang_agent + tutorial/tutorial_grpo_fsdp_qwen3vl tutorial/multinode_train tutorial/continue_train tutorial/tuning_guide diff --git a/docs/zh/index.rst b/docs/zh/index.rst index ec66eb5e..793d4ace 100644 --- a/docs/zh/index.rst +++ b/docs/zh/index.rst @@ -24,6 +24,7 @@ ChatLearn 使用文档 tutorial/tutorial_grpo_mcore tutorial/tutorial_grpo_fsdp_qwenvl tutorial/tutorial_grpo_mcore_qwenvl + tutorial/tutorial_grpo_fsdp_qwen3vl tutorial/tutorial_grpo_fsdp_sglang_agent tutorial/multinode_train tutorial/continue_train From 42eefe12ae14a3d5f036fe211e8dee4795b2dd7e Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Thu, 16 Oct 2025 15:41:36 +0800 Subject: [PATCH 09/17] change rope index --- .../algorithm/grpo_utils/policy_trainer.py | 19 +++++++++++++ chatlearn/data/vl_prompt_dataset.py | 23 +++++++-------- chatlearn/models/agent/agent_module.py | 7 +++-- chatlearn/models/agent/base_agent_graph.py | 28 +++++++++---------- 4 files changed, 49 insertions(+), 28 deletions(-) diff --git a/chatlearn/algorithm/grpo_utils/policy_trainer.py b/chatlearn/algorithm/grpo_utils/policy_trainer.py index b5d17fd4..46164996 100644 --- a/chatlearn/algorithm/grpo_utils/policy_trainer.py +++ b/chatlearn/algorithm/grpo_utils/policy_trainer.py @@ -117,6 +117,7 @@ def preprocess_data_list(self, data_list: List[Dict[str, Any]], training: bool): tokens_, indices, *_ = unpad_input(tokens_.unsqueeze(-1).cuda(), attn_mask.cuda()) tokens_ = tokens_.permute(1,0).cpu() # For compatible with transformers position_ids, *_ = unpad_input(position_ids.unsqueeze(-1).cuda(), attn_mask.cuda()) + if self.runtime_args.model_type == 'vlm': # vl position_ids = position_ids.permute(0, 2, 1).cpu() @@ -168,6 +169,20 @@ def preprocess_data_list(self, data_list: List[Dict[str, Any]], training: bool): data_after_process.append(data_obj) return response_token_length_total, data_after_process + + def compute_vl_position_ids(self, data_list: List[Dict[str, Any]]): + input_ids_key = 'input_ids' if 'input_ids' in data_list[0] else 'prompt_token_ids' + + for data_b in data_list: + position_ids, _ = self.model.model.get_rope_index( + input_ids=torch.tensor(data_b[input_ids_key]).unsqueeze(0), + 
image_grid_thw=data_b["image_grid_thw"], + attention_mask=torch.tensor(data_b['attention_mask']).unsqueeze(0) + ) + data_b['position_ids'] = position_ids.squeeze().tolist() + + return data_list + @monitor_error() @compute_decorator(trainable=True, rollout=False) @timeit() @@ -298,8 +313,12 @@ def train_step(self, data_list: List[Dict[str, Any]], **kwargs): # pylint: disab @compute_decorator(trainable=False, rollout=False) @timeit() def forward_step(self, data: List[Dict[str, Any]], **kwargs) -> List[Dict[str, Any]]: # pylint: disable=unused-argument,arguments-differ + if self.runtime_args.model_type == 'vlm': + data = self.compute_vl_position_ids(data) + _, data_list = self.preprocess_data_list(data_list=data, training=False) tag = "old_logprobs" if self.trainable else "ref_logprobs" + # Logprobs holder for inputs in data_list: for k, v in inputs.items(): diff --git a/chatlearn/data/vl_prompt_dataset.py b/chatlearn/data/vl_prompt_dataset.py index 0d4e69b9..662f0ba7 100644 --- a/chatlearn/data/vl_prompt_dataset.py +++ b/chatlearn/data/vl_prompt_dataset.py @@ -6,7 +6,7 @@ from transformers import AutoTokenizer, AutoProcessor from qwen_vl_utils import process_vision_info -from chatlearn.models.patches.transformers.qwen2_5_vl_patch import get_rope_index +# from chatlearn.models.patches.transformers.qwen2_5_vl_patch import get_rope_index class PromptPipeline(Dataset): @@ -99,14 +99,14 @@ def __init__( # text only input_ids for vllm raw_input_ids = self.tokenizer.encode(raw_prompt, add_special_tokens=False) # get position_ids used for sequence packing - position_ids, _ = get_rope_index( - self.processor, - input_ids=input_ids, - image_grid_thw=model_inputs.get("image_grid_thw"), - video_grid_thw=model_inputs.get("video_grid_thw"), - second_per_grid_ts=model_inputs.get("second_per_grid_ts"), - attention_mask=attention_mask, - ) + # position_ids, _ = get_rope_index( + # self.processor, + # input_ids=input_ids, + # image_grid_thw=model_inputs.get("image_grid_thw"), + # video_grid_thw=model_inputs.get("video_grid_thw"), + # second_per_grid_ts=model_inputs.get("second_per_grid_ts"), + # attention_mask=attention_mask, + # ) # for vl model, raw_input_ids is only text input_ids for vllm inference # input_ids is used for model forward_step and sglang inference (with image pad) @@ -116,11 +116,12 @@ def __init__( "input_ids": input_ids[0].tolist(), "prompt_token_length": len(input_ids[0].tolist()), "prompt": raw_prompt, - "position_ids": position_ids.squeeze().tolist(), + # "position_ids": position_ids.squeeze().tolist(), "multi_modal_data": multi_modal_data, "mm_processor_kwargs": mm_processor_kwargs, "pixel_values": pixel_values, - "image_grid_thw": image_grid_thw + "image_grid_thw": image_grid_thw, + "attention_mask": attention_mask[0].tolist(), }) if len(input_ids[0]) > self.max_prompt: self.max_prompt = len(input_ids[0]) diff --git a/chatlearn/models/agent/agent_module.py b/chatlearn/models/agent/agent_module.py index 98aceda9..7daae1a9 100644 --- a/chatlearn/models/agent/agent_module.py +++ b/chatlearn/models/agent/agent_module.py @@ -89,7 +89,9 @@ def postprocess_func( prompt_token_ids = output.prompt_ids pixel_values = output.pixel_values image_grid_thw = output.image_grid_thw - position_ids = output.position_ids + attentiion_mask = output.attention_mask + + # position_ids = output.position_ids response_token_length = len(output.all_token_ids) - len(output.prompt_ids) prompt_token_length = len(output.prompt_ids) str_outputs = output.str_output @@ -108,7 +110,8 @@ def postprocess_func( # multimodel 
related "pixel_values": pixel_values, "image_grid_thw": image_grid_thw, - "position_ids": position_ids + "attention_mask": attentiion_mask + # "position_ids": position_ids } ) data_output.append(input_data) diff --git a/chatlearn/models/agent/base_agent_graph.py b/chatlearn/models/agent/base_agent_graph.py index f6532220..df109a50 100644 --- a/chatlearn/models/agent/base_agent_graph.py +++ b/chatlearn/models/agent/base_agent_graph.py @@ -26,9 +26,7 @@ find_last_ai_index, find_first_ai_index) from chatlearn.models.sglang_module import AsyncEngine -from chatlearn.models.patches.transformers.qwen2_5_vl_patch import get_rope_index - - +# from chatlearn.models.patches.transformers.qwen2_5_vl_patch import get_rope_index def find_first_zero_group_end(lst): for i, x in enumerate(lst): if x != 0: @@ -55,7 +53,7 @@ class AgentGraphOutput(BaseModel): # multimodel related item pixel_values: Any = None image_grid_thw: Any = None - position_ids: Any = None + attention_mask: Any = None # Extra fields for dynamic addition. extra_fields: dict[str, Any] = {} @@ -108,21 +106,21 @@ def convert_agent_graph_output(self, messages: Dict) -> AgentGraphOutput: num_turns = last_ai_message_idx + 1 str_output = self.tokenizer.decode(all_token_ids[prompt_end_idx + 1 :]) - pixel_values, image_grid_thw, position_ids = None, None, None + pixel_values, image_grid_thw = None, None multimodel_batch_feature = messages[first_ai_message_idx].response_metadata.get("multimodel_batch_feature", None) if multimodel_batch_feature: pixel_values = multimodel_batch_feature.get("pixel_values") image_grid_thw = multimodel_batch_feature.get("image_grid_thw") # need to get position ids used in sequence packing - position_ids, _ = get_rope_index( - self.processor, - input_ids=multimodel_batch_feature.get("input_ids"), - image_grid_thw=multimodel_batch_feature.get("image_grid_thw"), - video_grid_thw=multimodel_batch_feature.get("video_grid_thw"), - second_per_grid_ts=multimodel_batch_feature.get("second_per_grid_ts"), - attention_mask=multimodel_batch_feature.get("attention_mask"), - ) - position_ids = position_ids.squeeze().tolist() + # position_ids, _ = get_rope_index( + # self.processor, + # input_ids=multimodel_batch_feature.get("input_ids"), + # image_grid_thw=multimodel_batch_feature.get("image_grid_thw"), + # video_grid_thw=multimodel_batch_feature.get("video_grid_thw"), + # second_per_grid_ts=multimodel_batch_feature.get("second_per_grid_ts"), + # attention_mask=multimodel_batch_feature.get("attention_mask"), + # ) + # position_ids = position_ids.squeeze().tolist() return AgentGraphOutput( str_output=str_output, @@ -132,5 +130,5 @@ def convert_agent_graph_output(self, messages: Dict) -> AgentGraphOutput: num_turns=num_turns, pixel_values=pixel_values, image_grid_thw=image_grid_thw, - position_ids=position_ids + attention_mask=multimodel_batch_feature.get("attention_mask")[0].tolist() ) From 782130b3bd63fffd87efe7fc3b952bceafcec7e0 Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Thu, 16 Oct 2025 16:40:57 +0800 Subject: [PATCH 10/17] update --- chatlearn/algorithm/grpo_utils/policy_trainer.py | 3 +-- chatlearn/models/agent/agent_module.py | 2 +- chatlearn/models/agent/base_agent_graph.py | 4 +++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/chatlearn/algorithm/grpo_utils/policy_trainer.py b/chatlearn/algorithm/grpo_utils/policy_trainer.py index 46164996..27088d18 100644 --- a/chatlearn/algorithm/grpo_utils/policy_trainer.py +++ b/chatlearn/algorithm/grpo_utils/policy_trainer.py @@ -169,7 +169,6 @@ def 
preprocess_data_list(self, data_list: List[Dict[str, Any]], training: bool): data_after_process.append(data_obj) return response_token_length_total, data_after_process - def compute_vl_position_ids(self, data_list: List[Dict[str, Any]]): input_ids_key = 'input_ids' if 'input_ids' in data_list[0] else 'prompt_token_ids' @@ -315,7 +314,7 @@ def train_step(self, data_list: List[Dict[str, Any]], **kwargs): # pylint: disab def forward_step(self, data: List[Dict[str, Any]], **kwargs) -> List[Dict[str, Any]]: # pylint: disable=unused-argument,arguments-differ if self.runtime_args.model_type == 'vlm': data = self.compute_vl_position_ids(data) - + _, data_list = self.preprocess_data_list(data_list=data, training=False) tag = "old_logprobs" if self.trainable else "ref_logprobs" diff --git a/chatlearn/models/agent/agent_module.py b/chatlearn/models/agent/agent_module.py index 7daae1a9..abdced17 100644 --- a/chatlearn/models/agent/agent_module.py +++ b/chatlearn/models/agent/agent_module.py @@ -110,7 +110,7 @@ def postprocess_func( # multimodel related "pixel_values": pixel_values, "image_grid_thw": image_grid_thw, - "attention_mask": attentiion_mask + "attention_mask": attentiion_mask, # "position_ids": position_ids } ) diff --git a/chatlearn/models/agent/base_agent_graph.py b/chatlearn/models/agent/base_agent_graph.py index df109a50..081a0b7c 100644 --- a/chatlearn/models/agent/base_agent_graph.py +++ b/chatlearn/models/agent/base_agent_graph.py @@ -54,6 +54,7 @@ class AgentGraphOutput(BaseModel): pixel_values: Any = None image_grid_thw: Any = None attention_mask: Any = None + # position_ids: Any = None # Extra fields for dynamic addition. extra_fields: dict[str, Any] = {} @@ -130,5 +131,6 @@ def convert_agent_graph_output(self, messages: Dict) -> AgentGraphOutput: num_turns=num_turns, pixel_values=pixel_values, image_grid_thw=image_grid_thw, - attention_mask=multimodel_batch_feature.get("attention_mask")[0].tolist() + attention_mask=multimodel_batch_feature.get("attention_mask")[0].tolist(), + # position_ids=position_ids ) From 882145fd88ca62f8d1ca46e5f039302226a53029 Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Thu, 16 Oct 2025 16:50:32 +0800 Subject: [PATCH 11/17] update --- .../algorithm/grpo_utils/policy_trainer.py | 2 -- chatlearn/data/vl_prompt_dataset.py | 15 +-------------- chatlearn/models/agent/agent_module.py | 4 +--- chatlearn/models/agent/base_agent_graph.py | 18 +++--------------- 4 files changed, 5 insertions(+), 34 deletions(-) diff --git a/chatlearn/algorithm/grpo_utils/policy_trainer.py b/chatlearn/algorithm/grpo_utils/policy_trainer.py index 27088d18..d25ff0f4 100644 --- a/chatlearn/algorithm/grpo_utils/policy_trainer.py +++ b/chatlearn/algorithm/grpo_utils/policy_trainer.py @@ -117,7 +117,6 @@ def preprocess_data_list(self, data_list: List[Dict[str, Any]], training: bool): tokens_, indices, *_ = unpad_input(tokens_.unsqueeze(-1).cuda(), attn_mask.cuda()) tokens_ = tokens_.permute(1,0).cpu() # For compatible with transformers position_ids, *_ = unpad_input(position_ids.unsqueeze(-1).cuda(), attn_mask.cuda()) - if self.runtime_args.model_type == 'vlm': # vl position_ids = position_ids.permute(0, 2, 1).cpu() @@ -317,7 +316,6 @@ def forward_step(self, data: List[Dict[str, Any]], **kwargs) -> List[Dict[str, A _, data_list = self.preprocess_data_list(data_list=data, training=False) tag = "old_logprobs" if self.trainable else "ref_logprobs" - # Logprobs holder for inputs in data_list: for k, v in inputs.items(): diff --git a/chatlearn/data/vl_prompt_dataset.py 
b/chatlearn/data/vl_prompt_dataset.py index 662f0ba7..2f77ff0b 100644 --- a/chatlearn/data/vl_prompt_dataset.py +++ b/chatlearn/data/vl_prompt_dataset.py @@ -6,9 +6,6 @@ from transformers import AutoTokenizer, AutoProcessor from qwen_vl_utils import process_vision_info -# from chatlearn.models.patches.transformers.qwen2_5_vl_patch import get_rope_index - - class PromptPipeline(Dataset): """ Input data_list: List[Dict]) @@ -98,16 +95,7 @@ def __init__( # text only input_ids for vllm raw_input_ids = self.tokenizer.encode(raw_prompt, add_special_tokens=False) - # get position_ids used for sequence packing - # position_ids, _ = get_rope_index( - # self.processor, - # input_ids=input_ids, - # image_grid_thw=model_inputs.get("image_grid_thw"), - # video_grid_thw=model_inputs.get("video_grid_thw"), - # second_per_grid_ts=model_inputs.get("second_per_grid_ts"), - # attention_mask=attention_mask, - # ) - + # for vl model, raw_input_ids is only text input_ids for vllm inference # input_ids is used for model forward_step and sglang inference (with image pad) # sglang support both input_ids and raw_input_ids but to merge in all_tokens, input_ids is used @@ -116,7 +104,6 @@ def __init__( "input_ids": input_ids[0].tolist(), "prompt_token_length": len(input_ids[0].tolist()), "prompt": raw_prompt, - # "position_ids": position_ids.squeeze().tolist(), "multi_modal_data": multi_modal_data, "mm_processor_kwargs": mm_processor_kwargs, "pixel_values": pixel_values, diff --git a/chatlearn/models/agent/agent_module.py b/chatlearn/models/agent/agent_module.py index abdced17..ab74b368 100644 --- a/chatlearn/models/agent/agent_module.py +++ b/chatlearn/models/agent/agent_module.py @@ -91,7 +91,6 @@ def postprocess_func( image_grid_thw = output.image_grid_thw attentiion_mask = output.attention_mask - # position_ids = output.position_ids response_token_length = len(output.all_token_ids) - len(output.prompt_ids) prompt_token_length = len(output.prompt_ids) str_outputs = output.str_output @@ -110,8 +109,7 @@ def postprocess_func( # multimodel related "pixel_values": pixel_values, "image_grid_thw": image_grid_thw, - "attention_mask": attentiion_mask, - # "position_ids": position_ids + "attention_mask": attentiion_mask } ) data_output.append(input_data) diff --git a/chatlearn/models/agent/base_agent_graph.py b/chatlearn/models/agent/base_agent_graph.py index 081a0b7c..b9f7a9c6 100644 --- a/chatlearn/models/agent/base_agent_graph.py +++ b/chatlearn/models/agent/base_agent_graph.py @@ -26,7 +26,6 @@ find_last_ai_index, find_first_ai_index) from chatlearn.models.sglang_module import AsyncEngine -# from chatlearn.models.patches.transformers.qwen2_5_vl_patch import get_rope_index def find_first_zero_group_end(lst): for i, x in enumerate(lst): if x != 0: @@ -54,7 +53,6 @@ class AgentGraphOutput(BaseModel): pixel_values: Any = None image_grid_thw: Any = None attention_mask: Any = None - # position_ids: Any = None # Extra fields for dynamic addition. 
extra_fields: dict[str, Any] = {} @@ -107,21 +105,12 @@ def convert_agent_graph_output(self, messages: Dict) -> AgentGraphOutput: num_turns = last_ai_message_idx + 1 str_output = self.tokenizer.decode(all_token_ids[prompt_end_idx + 1 :]) - pixel_values, image_grid_thw = None, None + pixel_values, image_grid_thw, attention_mask = None, None, None multimodel_batch_feature = messages[first_ai_message_idx].response_metadata.get("multimodel_batch_feature", None) if multimodel_batch_feature: pixel_values = multimodel_batch_feature.get("pixel_values") image_grid_thw = multimodel_batch_feature.get("image_grid_thw") - # need to get position ids used in sequence packing - # position_ids, _ = get_rope_index( - # self.processor, - # input_ids=multimodel_batch_feature.get("input_ids"), - # image_grid_thw=multimodel_batch_feature.get("image_grid_thw"), - # video_grid_thw=multimodel_batch_feature.get("video_grid_thw"), - # second_per_grid_ts=multimodel_batch_feature.get("second_per_grid_ts"), - # attention_mask=multimodel_batch_feature.get("attention_mask"), - # ) - # position_ids = position_ids.squeeze().tolist() + attention_mask = multimodel_batch_feature.get("attention_mask")[0].tolist() return AgentGraphOutput( str_output=str_output, @@ -131,6 +120,5 @@ def convert_agent_graph_output(self, messages: Dict) -> AgentGraphOutput: num_turns=num_turns, pixel_values=pixel_values, image_grid_thw=image_grid_thw, - attention_mask=multimodel_batch_feature.get("attention_mask")[0].tolist(), - # position_ids=position_ids + attention_mask=attention_mask ) From 457fa45c699c3b582ac8cf79ea34a5ed90ef88ac Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Fri, 17 Oct 2025 11:40:47 +0800 Subject: [PATCH 12/17] 25vl support transformers>=4.55.0 --- chatlearn/algorithm/grpo_utils/policy_trainer.py | 7 +++++-- chatlearn/data/vl_prompt_dataset.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/chatlearn/algorithm/grpo_utils/policy_trainer.py b/chatlearn/algorithm/grpo_utils/policy_trainer.py index d25ff0f4..b636cb25 100644 --- a/chatlearn/algorithm/grpo_utils/policy_trainer.py +++ b/chatlearn/algorithm/grpo_utils/policy_trainer.py @@ -22,6 +22,8 @@ import torch.distributed as dist import torch.nn.functional as F from flash_attn.bert_padding import pad_input +from packaging.version import Version as PkgVersion +import transformers from chatlearn import FSDPModule from chatlearn.utils import to_device @@ -37,7 +39,6 @@ split_and_unpadding, unpad_input) - class PolicyTrainer(FSDPModule): """policy trainer""" def setup(self): @@ -120,6 +121,8 @@ def preprocess_data_list(self, data_list: List[Dict[str, Any]], training: bool): if self.runtime_args.model_type == 'vlm': # vl position_ids = position_ids.permute(0, 2, 1).cpu() + if PkgVersion(transformers.__version__)>=PkgVersion('4.55.0'): + position_ids = torch.cat([position_ids[0:1], position_ids], dim=0) # add text position_ids for vl else: position_ids = position_ids.permute(1, 0).cpu() # For compatible with transformers @@ -313,7 +316,7 @@ def train_step(self, data_list: List[Dict[str, Any]], **kwargs): # pylint: disab def forward_step(self, data: List[Dict[str, Any]], **kwargs) -> List[Dict[str, Any]]: # pylint: disable=unused-argument,arguments-differ if self.runtime_args.model_type == 'vlm': data = self.compute_vl_position_ids(data) - + _, data_list = self.preprocess_data_list(data_list=data, training=False) tag = "old_logprobs" if self.trainable else "ref_logprobs" # Logprobs holder diff --git a/chatlearn/data/vl_prompt_dataset.py 
b/chatlearn/data/vl_prompt_dataset.py index 2f77ff0b..11101d31 100644 --- a/chatlearn/data/vl_prompt_dataset.py +++ b/chatlearn/data/vl_prompt_dataset.py @@ -95,7 +95,7 @@ def __init__( # text only input_ids for vllm raw_input_ids = self.tokenizer.encode(raw_prompt, add_special_tokens=False) - + # for vl model, raw_input_ids is only text input_ids for vllm inference # input_ids is used for model forward_step and sglang inference (with image pad) # sglang support both input_ids and raw_input_ids but to merge in all_tokens, input_ids is used From 8fb43a608957852cf9861efbe42ff42a1c095622 Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Fri, 17 Oct 2025 11:45:45 +0800 Subject: [PATCH 13/17] update --- chatlearn/data/vl_prompt_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/chatlearn/data/vl_prompt_dataset.py b/chatlearn/data/vl_prompt_dataset.py index 11101d31..b97d1c38 100644 --- a/chatlearn/data/vl_prompt_dataset.py +++ b/chatlearn/data/vl_prompt_dataset.py @@ -38,6 +38,7 @@ class PromptPipeline(Dataset): "mm_processor_kwargs": {'fps':[]}, # used for video useless now "pixel_values": Tensor, # [grid_num, pixel_num] "image_grid_thw": Tensor, # [1,3] 3 means t,h,w + "attention_mask": List, used for compute position_ids } """ def __init__( From fdfd93c381b34b5f88751a6467d42a2bd1137b26 Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Fri, 17 Oct 2025 13:24:10 +0800 Subject: [PATCH 14/17] remove doc --- docker/torch/Dockerfile.torch2.8.0.sglang053 | 15 ++++ docs/en/index.rst | 1 - .../en/tutorial/tutorial_grpo_fsdp_qwen3vl.md | 71 ------------------ docs/zh/index.rst | 1 - docs/zh/installation.md | 4 +- .../zh/tutorial/tutorial_grpo_fsdp_qwen3vl.md | 73 ------------------- docs/zh/tutorial/tutorial_grpo_fsdp_qwenvl.md | 5 +- 7 files changed, 20 insertions(+), 150 deletions(-) create mode 100644 docker/torch/Dockerfile.torch2.8.0.sglang053 delete mode 100644 docs/en/tutorial/tutorial_grpo_fsdp_qwen3vl.md delete mode 100644 docs/zh/tutorial/tutorial_grpo_fsdp_qwen3vl.md diff --git a/docker/torch/Dockerfile.torch2.8.0.sglang053 b/docker/torch/Dockerfile.torch2.8.0.sglang053 new file mode 100644 index 00000000..085e044d --- /dev/null +++ b/docker/torch/Dockerfile.torch2.8.0.sglang053 @@ -0,0 +1,15 @@ +FROM nvcr.io/nvidia/pytorch:24.12-py3 +ENV PIP_INDEX_URL=https://mirrors.aliyun.com/pypi/simple/ +ENV PIP_TRUSTED_HOST=mirrors.aliyun.com +RUN pip install --no-cache-dir "sglang[all]==0.5.3.post1" +RUN pip install --no-cache-dir transformers==4.57.0 +RUN pip install --no-cache-dir langgraph==0.6.6 +RUN pip install --no-cache-dir ray[default]==2.46.0 +RUN pip install --no-cache-dir accelerate==1.10.0 +RUN pip install --no-cache-dir wandb==0.19.3 +RUN pip install --no-cache-dir hydra-core==1.3.2 +RUN pip install --no-cache-dir grpcio==1.70.0 nvidia-modelopt==0.27.0 nvidia-modelopt-core==0.27.0 datasets==3.6.0 deepspeed==0.16.7 +RUN pip install --no-cache-dir mathruler==0.1.0 pylatexenc==2.10 qwen-vl-utils==0.0.14 +RUN pip uninstall -y flash_attn && pip install --no-cache-dir https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/csrc/flash-attention/torch2.8.0-cu12x/flash_attn-2.7.4.post1-cp312-cp312-linux_x86_64.whl +RUN pip uninstall -y transformer_engine && pip install --no-cache-dir https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/csrc/transformer_engine/torch2.8.0-cuda12x/transformer_engine-2.3.0%2B5de3e148-cp312-cp312-linux_x86_64.whl +RUN pip uninstall -y apex && pip install --no-cache-dir 
https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/csrc/apex/torch2.8.0-cuda12x/apex-0.1-cp312-cp312-linux_x86_64.whl \ No newline at end of file diff --git a/docs/en/index.rst b/docs/en/index.rst index c9c2b71a..ed4dc7bc 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -24,7 +24,6 @@ ChatLearn Documentation tutorial/tutorial_grpo_fsdp_qwenvl tutorial/tutorial_grpo_mcore_qwenvl tutorial/tutorial_grpo_fsdp_sglang_agent - tutorial/tutorial_grpo_fsdp_qwen3vl tutorial/multinode_train tutorial/continue_train tutorial/tuning_guide diff --git a/docs/en/tutorial/tutorial_grpo_fsdp_qwen3vl.md b/docs/en/tutorial/tutorial_grpo_fsdp_qwen3vl.md deleted file mode 100644 index 18c5680d..00000000 --- a/docs/en/tutorial/tutorial_grpo_fsdp_qwen3vl.md +++ /dev/null @@ -1,71 +0,0 @@ -# Qwen3-VL End-to-End GRPO Training Tutorial with FSDP - -This document provides instructions for end-to-end training using the ChatLearn, pytorch FSDP and vLLM framework, and the qwen3vl-8b model. - -## Environment Setup -1. Docker Image Preparation - -We recommend running the following example in PAI [DSW](https://help.aliyun.com/zh/pai/user-guide/create-and-manage-dsw-instances/)/[DLC]( https://help.aliyun.com/zh/pai/user-guide/create-a-training-task). You need to use the following image to launch the instance. -```bash -dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.3-ubuntu24.04-cuda12.6-py312 -``` - -You can use a VPC address to accelerate image pulling. The image address should be adjusted based on the current region. For example, if you need to launch a DSW instance in Shanghai, you can use the following image `dsw-registry-vpc.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.3-ubuntu24.04-cuda12.6-py312`. - -2. Code Preparation - -```bash -git clone https://github.com/alibaba/ChatLearn.git && cd ChatLearn -``` - -## Data Preparation -We take [geo3k](https://hf-mirror.com/datasets/hiyouga/geometry3k) as exmaple. -```bash -# download dataset -mkdir -p dataset -export HF_ENDPOINT=https://hf-mirror.com - -# data process -python chatlearn/data/data_preprocess/geo3k.py -``` - -## Training -You can run the following command to start training: - -### Qwen3VL-8B -Run this command on server with 8 GPUs -MOE model is also supported -```bash -# download model weight -modelscope download --model Qwen/Qwen3-VL-8B-Instruct --local_dir pretrained_models/Qwen3-VL-8B-Instruct - -bash scripts/fsdp_sglang/train_fsdp_sglang_qwen3_vl_8b_grpo.sh -``` - -## Using Wandb -If you want to use Wandb to log the training process, you need to modify the configuration with: -```bash -export WANDB_API_KEY="Your-Wandb-api-key" -``` -Change the configuration to: -```bash -runtime_args.log_args_dict.enable_wandb=True -runtime_args.log_args_dict.wandb_project="Your-Wandb-Project-Name" -``` - -## Model Conversion -Saving FSDP models is time-consuming. Chatlearn provides an offline model conversion feature, which converts FSDP-sharded checkpoints back to HuggingFace format. 
The script is as follows: -```bash -export CHATLEARN=$(pwd) -python chatlearn/offline_ckpt_converter.py \ - --hf_dir ${CHATLEARN}/Qwen3-VL-8B-Instruct/ \ - --ckpt_dir ${CHATLEARN}/output/qwen3vl-grpo-8b/save_model/policy_trainer \ - --save_dir ${CHATLEARN}/output/qwen3vl-grpo-8b/save_model/huggingface/ \ - --iter 200 \ - --groupgemm 0 -``` -If you are training an MoE model with groupgemm, please make sure to set: -```bash - --groupgemm 1 -``` -This script will convert the final FSDP sharded model after training back into a HuggingFace model and save it in the path "${CHATLEARN}/output/qwen3vl-grpo-8b/save_model/huggingface/". \ No newline at end of file diff --git a/docs/zh/index.rst b/docs/zh/index.rst index 793d4ace..ec66eb5e 100644 --- a/docs/zh/index.rst +++ b/docs/zh/index.rst @@ -24,7 +24,6 @@ ChatLearn 使用文档 tutorial/tutorial_grpo_mcore tutorial/tutorial_grpo_fsdp_qwenvl tutorial/tutorial_grpo_mcore_qwenvl - tutorial/tutorial_grpo_fsdp_qwen3vl tutorial/tutorial_grpo_fsdp_sglang_agent tutorial/multinode_train tutorial/continue_train diff --git a/docs/zh/installation.md b/docs/zh/installation.md index befa2ac5..409b6d00 100644 --- a/docs/zh/installation.md +++ b/docs/zh/installation.md @@ -13,10 +13,10 @@ dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2 ### SGLang -可以参考 [Dockerfile.torch2.8.0.sglang052](https://github.com/alibaba/ChatLearn/blob/main/docker/torch/Dockerfile.torch2.8.0.sglang052) 准备镜像。也可以直接拉取如下镜像地址直接进行使用。 +可以参考 [Dockerfile.torch2.8.0.sglang053](https://github.com/alibaba/ChatLearn/blob/main/docker/torch/Dockerfile.torch2.8.0.sglang053) 准备镜像。也可以直接拉取如下镜像地址直接进行使用。 ```bash -dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.2-ubuntu24.04-cuda12.6-py312 +dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.3-ubuntu24.04-cuda12.6-py312 ``` ## 2. 代码准备 diff --git a/docs/zh/tutorial/tutorial_grpo_fsdp_qwen3vl.md b/docs/zh/tutorial/tutorial_grpo_fsdp_qwen3vl.md deleted file mode 100644 index f3acc2ee..00000000 --- a/docs/zh/tutorial/tutorial_grpo_fsdp_qwen3vl.md +++ /dev/null @@ -1,73 +0,0 @@ -# 基于 FSDP 的端到端 Qwen35VL GRPO训练流程 - -本文档提供使用 ChatLearn、PyTorch FSDP 和 vLLM 框架来对Qwen3-VL模型进行GRPO训练的快速开始指南。 - -## 环境配置 -1. Docker镜像准备 -我们建议在PAI [DSW](https://help.aliyun.com/zh/pai/user-guide/create-and-manage-dsw-instances/)/[DLC](https://help.aliyun.com/zh/pai/user-guide/create-a-training-task)中运行该示例,你需要填写如下镜像地址来启动实例: -```bash -dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.3-ubuntu24.04-cuda12.6-py312 -``` - -可以使用vpc地址来加速镜像拉取速度,需要根据当前region信息来更改镜像地址。比如,启动在上海的DSW实例,可以使用如下镜像`dsw-registry-vpc.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.3-ubuntu24.04-cuda12.6-py312`。 - -2. 代码准备 - -```bash -git clone https://github.com/alibaba/ChatLearn.git && cd ChatLearn -``` - -## 数据准备 - -以[geo3k](https://hf-mirror.com/datasets/hiyouga/geometry3k)数据集作为示例. 
-```bash -# 下载数据集 -mkdir -p dataset - -export HF_ENDPOINT=https://hf-mirror.com - -# 数据集预处理 -python chatlearn/data/data_preprocess/geo3k.py -``` - -## 训练 -运行以下命令开始训练: - -### Qwen3VL-8B -8卡机器运行如下命令 -MOE 模型也同样支持 -```bash -# 下载模型权重 -modelscope download --model Qwen/Qwen3-VL-8B-Instruct --local_dir pretrained_models/Qwen3-VL-8B-Instruct - -bash scripts/fsdp_sglang/train_fsdp_sglang_qwen3_vl_8b_grpo.sh -``` - -## 使用 Wandb 监控 -如需使用 Wandb 记录训练过程,请修改对应脚本中的配置: - -```bash -export WANDB_API_KEY="Your-Wandb-api-key" -``` -将配置项改为: -```bash -runtime_args.log_args_dict.enable_wandb=True -runtime_args.log_args_dict.wandb_project="Your-Wandb-Project-Name" -``` - -## 模型转化 -FSDP模型保存耗时较高,Chatlearn提供了离线模型转化功能,将FSDP保存的切片模型转化回huggingface模型。脚本如下: -```bash -export CHATLEARN=$(pwd) -python chatlearn/offline_ckpt_converter.py \ - --hf_dir ${CHATLEARN}/Qwen3-VL-8B-Instruct/ \ - --ckpt_dir ${CHATLEARN}/output/qwen3vl-grpo-8b/save_model/policy_trainer \ - --save_dir ${CHATLEARN}/output/qwen3vl-grpo-8b/save_model/huggingface/ \ - --iter 200 \ - --groupgemm 0 -``` -如果你使用groupgemm优化的moe模型训练,请确保设置: -```bash - --groupgemm 1 -``` -这段脚本会将训练完成后的最后一个FSDP切片模型转化回HF模型,并保存在"${CHATLEARN}/output/qwen3vl-grpo-8b/save_model/huggingface/"路径下 diff --git a/docs/zh/tutorial/tutorial_grpo_fsdp_qwenvl.md b/docs/zh/tutorial/tutorial_grpo_fsdp_qwenvl.md index 5e659bf3..e570a071 100644 --- a/docs/zh/tutorial/tutorial_grpo_fsdp_qwenvl.md +++ b/docs/zh/tutorial/tutorial_grpo_fsdp_qwenvl.md @@ -1,6 +1,6 @@ -# 基于 FSDP 的端到端 Qwen2.5VL GRPO训练流程 +# 基于 FSDP 的端到端 VLM模型 GRPO训练流程 -本文档提供使用 ChatLearn、PyTorch FSDP 和 vLLM 框架来对Qwen2.5-VL模型进行GRPO训练的快速开始指南。 +本文档提供使用 ChatLearn、PyTorch FSDP 和 vLLM 框架来对VLM模型进行GRPO训练的快速开始指南。 ## 环境配置 1. Docker镜像准备 @@ -11,6 +11,7 @@ dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2 可以使用vpc地址来加速镜像拉取速度,需要根据当前region信息来更改镜像地址。比如,启动在上海的DSW实例,可以使用如下镜像`dsw-registry-vpc.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.6.0-vllm0.8.5-ubuntu24.04-cuda12.6-py312`。 + 2. 代码准备 ```bash From 7cadaf2148248494e6671cbce8badda743a91bdf Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Fri, 17 Oct 2025 14:02:19 +0800 Subject: [PATCH 15/17] update --- docs/en/installation.md | 12 ++++++++++-- docs/zh/installation.md | 9 +++++++++ docs/zh/tutorial/tutorial_grpo_fsdp_qwenvl.md | 5 ++--- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/docs/en/installation.md b/docs/en/installation.md index 8c8aa9a0..81ef42ca 100644 --- a/docs/en/installation.md +++ b/docs/en/installation.md @@ -14,11 +14,19 @@ dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2 ### SGLang -You can prepare the image by referring to [Dockerfile.torch2.8.0.sglang052](https://github.com/alibaba/ChatLearn/blob/main/docker/torch/Dockerfile.torch2.8.0.sglang052). Alternatively, you can directly pull and use the following image: +You can prepare the image by referring to [Dockerfile.torch2.8.0.sglang052](https://github.com/alibaba/ChatLearn/blob/main/docker/torch/Dockerfile.torch2.8.0.sglang053). 
Alternatively, you can directly pull and use the following image:
```bash
-dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.2-ubuntu24.04-cuda12.6-py312
+dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.3-ubuntu24.04-cuda12.6-py312
```
+### Image History
+
+| Image URL | Pkg Version | Model List |
+| ------------------------------------------------------------ | ----------------------------------------- | ------------------------------------------ |
+| dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.3-ubuntu24.04-cuda12.6-py312 | sglang 0.5.3.post1<br>transformers 4.57.0 | Qwen3-VL<br>Qwen2.5-VL<br>Qwen3<br>Qwen2.5 |
+| dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.2-ubuntu24.04-cuda12.6-py312 | sglang 0.5.2<br>transformers 4.56.1 | Qwen2.5-VL<br>Qwen3<br>Qwen2.5 |
+| dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.6.0-vllm0.8.5-te2.7-ubuntu24.04-cuda12.6-py312 | vllm 0.8.5<br>transformer_engine 2.7 | Moonlight<br>Deepseek-r1 |
+| dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.6.0-vllm0.8.5-ubuntu24.04-cuda12.6-py312 | vllm 0.8.5<br>transformers 4.51.3 | Qwen2.5-VL<br>Qwen3<br>Qwen2.5 |

## 2. Code Preparation

diff --git a/docs/zh/installation.md b/docs/zh/installation.md
index 409b6d00..5aa8db78 100644
--- a/docs/zh/installation.md
+++ b/docs/zh/installation.md
@@ -19,6 +19,15 @@ dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.3-ubuntu24.04-cuda12.6-py312
```
+### 镜像历史
+
+| 镜像地址 | 包版本 | 模型列表 |
+| ------------------------------------------------------------ | ----------------------------------------- | ------------------------------------------ |
+| dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.3-ubuntu24.04-cuda12.6-py312 | sglang 0.5.3.post1<br>transformers 4.57.0 | Qwen3-VL<br>Qwen2.5-VL<br>Qwen3<br>Qwen2.5 |
+| dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.8.0-sglang0.5.2-ubuntu24.04-cuda12.6-py312 | sglang 0.5.2<br>transformers 4.56.1 | Qwen2.5-VL<br>Qwen3<br>Qwen2.5 |
+| dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.6.0-vllm0.8.5-te2.7-ubuntu24.04-cuda12.6-py312 | vllm 0.8.5<br>transformer_engine 2.7 | Moonlight<br>Deepseek-r1 |
+| dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.6.0-vllm0.8.5-ubuntu24.04-cuda12.6-py312 | vllm 0.8.5<br>transformers 4.51.3 | Qwen2.5-VL<br>Qwen3<br>
Qwen2.5 | + ## 2. 代码准备 ``` diff --git a/docs/zh/tutorial/tutorial_grpo_fsdp_qwenvl.md b/docs/zh/tutorial/tutorial_grpo_fsdp_qwenvl.md index e570a071..5e659bf3 100644 --- a/docs/zh/tutorial/tutorial_grpo_fsdp_qwenvl.md +++ b/docs/zh/tutorial/tutorial_grpo_fsdp_qwenvl.md @@ -1,6 +1,6 @@ -# 基于 FSDP 的端到端 VLM模型 GRPO训练流程 +# 基于 FSDP 的端到端 Qwen2.5VL GRPO训练流程 -本文档提供使用 ChatLearn、PyTorch FSDP 和 vLLM 框架来对VLM模型进行GRPO训练的快速开始指南。 +本文档提供使用 ChatLearn、PyTorch FSDP 和 vLLM 框架来对Qwen2.5-VL模型进行GRPO训练的快速开始指南。 ## 环境配置 1. Docker镜像准备 @@ -11,7 +11,6 @@ dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2 可以使用vpc地址来加速镜像拉取速度,需要根据当前region信息来更改镜像地址。比如,启动在上海的DSW实例,可以使用如下镜像`dsw-registry-vpc.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2.6.0-vllm0.8.5-ubuntu24.04-cuda12.6-py312`。 - 2. 代码准备 ```bash From 8248c2ab0f8028c58a8d590273e2d316f796768a Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Fri, 17 Oct 2025 15:56:42 +0800 Subject: [PATCH 16/17] update --- chatlearn/models/fsdp_module.py | 28 ++++++++--- .../train_fsdp_sglang_qwen3_vl_30b_grpo.sh | 50 +++++++++++++++++++ 2 files changed, 70 insertions(+), 8 deletions(-) create mode 100644 scripts/fsdp_sglang/train_fsdp_sglang_qwen3_vl_30b_grpo.sh diff --git a/chatlearn/models/fsdp_module.py b/chatlearn/models/fsdp_module.py index a9f8961d..04081a63 100644 --- a/chatlearn/models/fsdp_module.py +++ b/chatlearn/models/fsdp_module.py @@ -271,14 +271,26 @@ def create_model(self, model_path: str , torch_dtype: torch.dtype, meta_init: bo ) else: model_config = AutoConfig.from_pretrained(model_path) - assert "Qwen2_5_VLForConditionalGeneration" not in model_config.architectures, "VL model not support meta init" - with init_on_device('meta', include_buffers=False): - model = AutoModelForCausalLM.from_config( - model_config, - torch_dtype=torch_dtype, - attn_implementation="flash_attention_2", - trust_remote_code=self.module_args.trust_remote_code - ) + # assert "Qwen2_5_VLForConditionalGeneration" not in model_config.architectures, "VL model not support meta init" + if self.runtime_args.model_type == 'vlm': + with init_on_device('meta', include_buffers=False): + model = AutoModelForImageTextToText.from_pretrained( + pretrained_model_name_or_path=model_path, + torch_dtype=torch_dtype, + attn_implementation="flash_attention_2", + trust_remote_code=self.module_args.trust_remote_code + ) + + from chatlearn.models.patches.monkey_patch import apply_qwenvl + apply_qwenvl(model) + else: + with init_on_device('meta', include_buffers=False): + model = AutoModelForCausalLM.from_config( + model_config, + torch_dtype=torch_dtype, + attn_implementation="flash_attention_2", + trust_remote_code=self.module_args.trust_remote_code + ) dist.barrier() return model @property diff --git a/scripts/fsdp_sglang/train_fsdp_sglang_qwen3_vl_30b_grpo.sh b/scripts/fsdp_sglang/train_fsdp_sglang_qwen3_vl_30b_grpo.sh new file mode 100644 index 00000000..af60f23f --- /dev/null +++ b/scripts/fsdp_sglang/train_fsdp_sglang_qwen3_vl_30b_grpo.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# Tested on 8xH20-3e with 140G VRAM +set -x + +export CHATLEARN=$(pwd) +export PYTHONPATH=${CHATLEARN}:${PYTHONPATH} +source scripts/base_env.sh +export RAY_DEDUP_LOGS=1 +export exp_name=qwen3-vl-grpo-30b-sglang + +python chatlearn/entrypoint.py grpo \ + --config-file template/grpo_fsdp.yaml \ + runtime_args.exp_name=${exp_name} \ + runtime_args.rollout_backend=sglang \ + runtime_args.model_type=vlm \ + runtime_args.data_path=${CHATLEARN}/dataset/geo3k/train.parquet \ + 
runtime_args.eval_data_path=${CHATLEARN}/dataset/geo3k/test.parquet \ + runtime_args.output_dir=${CHATLEARN}/output/${exp_name} \ + runtime_args.num_episode=200 \ + runtime_args.sample_per_episode=512 \ + runtime_args.train_global_batch_size=512 \ + runtime_args.train_micro_batch_size=8 \ + runtime_args.save_episode_interval=5 \ + runtime_args.eval_episode_interval=5 \ + runtime_args.enable_eval_before_training=False \ + runtime_args.log_args_dict.enable_wandb=False \ + runtime_args.log_args_dict.wandb_project=your_wandb_project \ + models.policy_trainer.num_gpu=${num_device} \ + models.policy_trainer.packing=True \ + models.policy_trainer.meta_init=True \ + models.policy_trainer.groupgemm=True \ + models.policy_trainer.generation_batch_size=64 \ + models.policy_trainer.ulysses_sequence_parallel_size=1 \ + models.policy_trainer.load=${CHATLEARN}/pretrained_models/Qwen3-VL-30B-A3B-Instruct/ \ + models.policy_trainer.optimizer.lr=1e-6 \ + models.policy_trainer.pos_clip_ratio=0.2 \ + models.policy_trainer.neg_clip_ratio=0.2 \ + models.policy_trainer.kl_coef=0.01 \ + models.ref_policy.generation_batch_size=64 \ + models.policy.generation_batch_size=64 \ + models.policy.enforce_eager=False \ + models.policy.tensor_model_parallel_size=1 \ + models.policy.max_prompt_tokens_length=1024 \ + models.policy.max_response_tokens_length=2048 \ + models.policy.num_inference_per_prompt=4 \ + models.policy.gpu_memory_utilization=0.85 \ + models.policy.enable_thinking=False \ + models.reward.generation_batch_size=256 \ + 2>&1 | tee log_${exp_name}.log ; exit ${PIPESTATUS[0]} From 5a7d80a9a93c418108ca51cf7a0ed3ef597abfc0 Mon Sep 17 00:00:00 2001 From: zouxinyi0625 Date: Fri, 17 Oct 2025 16:01:47 +0800 Subject: [PATCH 17/17] update --- chatlearn/models/fsdp_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chatlearn/models/fsdp_module.py b/chatlearn/models/fsdp_module.py index 04081a63..1ecdb313 100644 --- a/chatlearn/models/fsdp_module.py +++ b/chatlearn/models/fsdp_module.py @@ -271,7 +271,7 @@ def create_model(self, model_path: str , torch_dtype: torch.dtype, meta_init: bo ) else: model_config = AutoConfig.from_pretrained(model_path) - # assert "Qwen2_5_VLForConditionalGeneration" not in model_config.architectures, "VL model not support meta init" + if self.runtime_args.model_type == 'vlm': with init_on_device('meta', include_buffers=False): model = AutoModelForImageTextToText.from_pretrained(
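For readers unfamiliar with the meta-init path touched in the last two patches, the following is a minimal, self-contained sketch of the general pattern (illustrative only: `build_empty_model` is a hypothetical helper, not a ChatLearn API, and it assumes `accelerate` and `transformers` are installed). Parameters are created on the `meta` device, so no real memory is allocated until the weights are materialized later, for example when FSDP loads a sharded checkpoint.

```python
# Illustrative sketch of meta-device model construction; not ChatLearn code.
import torch
from accelerate import init_on_device
from transformers import AutoConfig, AutoModelForCausalLM

def build_empty_model(model_path: str, torch_dtype=torch.bfloat16):
    # Only parameter metadata (shapes/dtypes) lives on the meta device here;
    # real weights are loaded afterwards, e.g. from an FSDP sharded state dict.
    config = AutoConfig.from_pretrained(model_path)
    with init_on_device(torch.device("meta"), include_buffers=False):
        model = AutoModelForCausalLM.from_config(config, torch_dtype=torch_dtype)
    return model
```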