diff --git a/chatlearn/algorithm/grpo_utils/megatron_utils/train_helper.py b/chatlearn/algorithm/grpo_utils/megatron_utils/train_helper.py
index f230b7ed..3f7e7b2e 100644
--- a/chatlearn/algorithm/grpo_utils/megatron_utils/train_helper.py
+++ b/chatlearn/algorithm/grpo_utils/megatron_utils/train_helper.py
@@ -393,7 +393,6 @@ def get_batch(
             {
                 "pixel_values": data_b['pixel_values'], # [token_length, token_num]
                 "image_grid_thw": data_b['image_grid_thw'], # [batch_size, 3]
-                "rope_deltas": data_b['rope_deltas'], # [batch_size, 1]
                 "image_input_mask": tokens==get_tokenizer().vocab['<|image_pad|>'] # [batch_size, token_length]
             }
         )
diff --git a/chatlearn/algorithm/grpo_utils/policy_trainer.py b/chatlearn/algorithm/grpo_utils/policy_trainer.py
index 8a69b45f..b5d17fd4 100644
--- a/chatlearn/algorithm/grpo_utils/policy_trainer.py
+++ b/chatlearn/algorithm/grpo_utils/policy_trainer.py
@@ -103,7 +103,6 @@ def preprocess_data_list(self, data_list: List[Dict[str, Any]], training: bool):
 
             # for vl
             position_ids = data_b.get("position_ids", None)
-            rope_deltas = data_b.get("rope_deltas", None)
             pixel_values = data_b.get("pixel_values", None)
             image_grid_thw = data_b.get("image_grid_thw", None)
 
@@ -151,7 +150,6 @@ def preprocess_data_list(self, data_list: List[Dict[str, Any]], training: bool):
                     {
                         "pixel_values": pixel_values, # [token_length, token_num]
                         "image_grid_thw": image_grid_thw, # [batch_size, 3]
-                        "rope_deltas": rope_deltas # [batch_size, 1]
                     }
                 )
 
@@ -196,8 +194,7 @@ def train_step(self, data_list: List[Dict[str, Any]], **kwargs): # pylint: disab
                     image_grid_thw=inputs['image_grid_thw'],
                     attention_mask=None,
                     position_ids=inputs['position_ids'],
-                    use_cache=False,
-                    rope_deltas=inputs['rope_deltas']
+                    use_cache=False
                 )
             else:
                 output = self.model(
@@ -315,8 +312,7 @@ def forward_step(self, data: List[Dict[str, Any]], **kwargs) -> List[Dict[str, A
                         image_grid_thw=inputs['image_grid_thw'],
                         attention_mask=None,
                         position_ids=inputs['position_ids'],
-                        use_cache=False,
-                        rope_deltas=inputs['rope_deltas']
+                        use_cache=False
                     )
                 else:
                     output = self.model(
diff --git a/chatlearn/algorithm/grpo_utils/trainer_utils.py b/chatlearn/algorithm/grpo_utils/trainer_utils.py
index 8923f307..08324f7a 100644
--- a/chatlearn/algorithm/grpo_utils/trainer_utils.py
+++ b/chatlearn/algorithm/grpo_utils/trainer_utils.py
@@ -139,7 +139,7 @@ def batching(data_list: List[Dict[str, Any]]) -> Dict[str, Any]:
     batched_data = defaultdict(list)
     for key in data_list[0]:
         batched_data[key] = [data[key] for data in data_list]
-        if key in ['pixel_values', 'image_grid_thw', 'rope_deltas']:
+        if key in ['pixel_values', 'image_grid_thw']:
             batched_data[key] = torch.cat(batched_data[key], dim=0)
         elif isinstance(batched_data[key][0], torch.Tensor):
             batched_data[key] = padding_tensor(batched_data[key])
diff --git a/chatlearn/data/data_preprocess/geo3k.py b/chatlearn/data/data_preprocess/geo3k.py
index dd92fe69..5e03404c 100644
--- a/chatlearn/data/data_preprocess/geo3k.py
+++ b/chatlearn/data/data_preprocess/geo3k.py
@@ -3,9 +3,34 @@
 """
 
 import argparse
+import base64
 import os
+from typing import List, Dict
+from io import BytesIO
 
 import datasets
+from PIL import Image
+
+def image_to_base64(img: Image.Image) -> str:
+    """Return the base64 text of ``img`` re-encoded as an RGB JPEG (quality=100)."""
+    jpeg_buffer = BytesIO()
+    # Normalise the mode to RGB first, then serialise into the in-memory buffer.
+    img.convert("RGB").save(jpeg_buffer, format="JPEG", quality=100)
+    jpeg_bytes = jpeg_buffer.getvalue()
+    return base64.b64encode(jpeg_bytes).decode('utf-8')
+
+def prepare_image_content(img_list: List[Image.Image]) -> List[Dict]:
+    """Build one OpenAI-style ``image_url`` content item per image.
+
+    Each image is embedded inline as ``data:image;base64,<payload>``, where
+    the payload comes from ``image_to_base64``. The items keep the order of
+    ``img_list``; an empty list yields an empty result.
+    """
+    # An f-string is never None, so no per-item validity assert is needed.
+    return [
+        {"type": "image_url", "image_url": f"data:image;base64,{image_to_base64(img)}"}
+        for img in img_list
+    ]
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -32,18 +57,24 @@ def make_map_fn(split):
         def process_fn(example, idx):
             problem = example.pop("problem")
             prompt = problem + " " + instruction_following
+            prompt = prompt.replace("<image>", "")
             answer = example.pop("answer")
             images = example.pop("images")
+            # format openai style messages
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt}
+                    ]
+                }
+            ]
+            messages[0]["content"] = prepare_image_content(images)+ \
+                messages[0]["content"]
 
             data = {
                 "data_source": data_source,
-                "prompt": [
-                    {
-                        "role": "user",
-                        "content": prompt,
-                    }
-                ],
-                "images": images,
+                "messages": messages,
                 "ability": "math",
                 "reward_model": {"style": "rule", "ground_truth": answer},
                 "extra_info": {
@@ -63,5 +94,6 @@ def process_fn(example, idx):
     local_dir = args.local_dir
     hdfs_dir = args.hdfs_dir
 
+    # NOTE: after to_parquet, dict items may come back with absent keys filled in as None
     train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
     test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
diff --git a/chatlearn/data/data_preprocess/geo3k_agent.py b/chatlearn/data/data_preprocess/geo3k_agent.py
new file mode 100644
index 00000000..9adc0cbf
--- /dev/null
+++ b/chatlearn/data/data_preprocess/geo3k_agent.py
@@ -0,0 +1,109 @@
+"""
+Preprocess the Geometry3k dataset to parquet format
+"""
+
+import argparse
+import base64
+import os
+from typing import List, Dict
+from io import BytesIO
+
+import datasets
+from PIL import Image
+
+def image_to_base64(img: Image.Image) -> str:
+    """Return the base64 text of ``img`` re-encoded as an RGB JPEG (quality=100)."""
+    jpeg_buffer = BytesIO()
+    # Normalise the mode to RGB first, then serialise into the in-memory buffer.
+    img.convert("RGB").save(jpeg_buffer, format="JPEG", quality=100)
+    jpeg_bytes = jpeg_buffer.getvalue()
+    return base64.b64encode(jpeg_bytes).decode('utf-8')
+
+def prepare_image_content(img_list: List[Image.Image]) -> List[Dict]:
+    """Build one OpenAI-style ``image_url`` content item per image.
+
+    Each image is embedded inline as ``data:image;base64,<payload>``, where
+    the payload comes from ``image_to_base64``. The items keep the order of
+    ``img_list``; an empty list yields an empty result.
+    """
+    # An f-string is never None, so no per-item validity assert is needed.
+    return [
+        {"type": "image_url", "image_url": f"data:image;base64,{image_to_base64(img)}"}
+        for img in img_list
+    ]
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--local_dir", default="dataset/geo3k")
+    parser.add_argument("--hdfs_dir", default=None)
+
+    args = parser.parse_args()
+
+    data_source = "hiyouga/geometry3k"
+
+    dataset = datasets.load_dataset(data_source)
+
+    train_dataset = dataset["train"]
+    test_dataset = dataset["test"]
+
+    instruction_following = (  # suffix appended to every problem statement in process_fn
+        r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. "
+        r"The reasoning process MUST BE enclosed within <think> </think> tags. "
+        r"You must use the `calc_geo3k_reward` tool after step by step solving the question. "  # ". " keeps it apart from the next literal
+        r"The final answer MUST BE put in \boxed{}."
+    )
+
+    system_prompt = (
+                    "You are a math expert. You are given a question and you need to solve it step by step. "
+                    "Reasoning step by step before any tool call. "
+                    "You should use the `calc_geo3k_reward` tool after step by step solving the question, "
+                    "before generate final answer at least once and refine your answer if necessary. "
+                    "Put your final answer within \\boxed{}."
+                )
+    # add a row to each data item that represents a unique id
+    def make_map_fn(split):
+        """Return the ``datasets.map`` callback that stamps records with ``split``."""
+        def process_fn(example, idx):
+            """Convert one raw example into an agent record with OpenAI-style messages."""
+            question = example.pop("problem")
+            # Images travel as separate content items, so the inline "<image>"
+            # placeholder is removed from the prompt text.
+            user_text = (question + " " + instruction_following).replace("<image>", "")
+            answer = example.pop("answer")
+            images = example.pop("images")
+            image_items = prepare_image_content(images)
+            system_turn = {
+                "role": "system",
+                "content": [{"type": "text", "text": system_prompt}],
+            }
+            user_turn = {
+                "role": "user",
+                "content": image_items + [{"type": "text", "text": user_text}],
+            }
+            return {
+                "agent_name": "geo3k_agent",
+                "agent_cfg_path": "template/agent/geo3k_eval.yaml",
+                "data_source": data_source,
+                "messages": [system_turn, user_turn],
+                "ability": "math",
+                "reward_model": {"style": "rule", "ground_truth": answer},
+                "extra_info": {
+                    "split": split,
+                    "index": idx,
+                    "answer": answer,
+                    "question": question,
+                },
+            }
+
+        return process_fn
+
+
+    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True, num_proc=8)
+    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True, num_proc=8)
+
+    local_dir = args.local_dir
+    hdfs_dir = args.hdfs_dir
+
+    # NOTE: after to_parquet, dict items may come back with absent keys filled in as None
+    train_dataset.to_parquet(os.path.join(local_dir, "train_agent.parquet"))
+    test_dataset.to_parquet(os.path.join(local_dir, "test_agent.parquet"))
diff --git a/chatlearn/data/vl_prompt_dataset.py b/chatlearn/data/vl_prompt_dataset.py
index df73b933..0d4e69b9 100644
--- a/chatlearn/data/vl_prompt_dataset.py
+++ b/chatlearn/data/vl_prompt_dataset.py
@@ -1,10 +1,11 @@
 """prompt dataset"""
 
 from typing import List, Dict
-import re
+
 from torch.utils.data import Dataset
 from transformers import AutoTokenizer, AutoProcessor
-from chatlearn.data.vision_utils import process_image, process_video
+from qwen_vl_utils import process_vision_info
+
 from chatlearn.models.patches.transformers.qwen2_5_vl_patch import get_rope_index
 
 
@@ -13,11 +14,7 @@ class PromptPipeline(Dataset):
     Input data_list: List[Dict])
     {
         "data_source": data_source,
-        "images": [PIL.Image]
-        "prompt": [{
-            "role": "user",
-            "content": question,
-        }],
+        "messages": openai-style messages List,
         "ability": "math",
         "reward_model": {
             "style": "rule",
@@ -38,12 +35,11 @@ class PromptPipeline(Dataset):
         "prompt_token_length": int, # len(input_ids)
         "prompt": String,
         "position_ids": List[List], # [3, token_length]
-        "rope_deltas": Tensor, # [1,1]
         "data_source": String,
         "ground_truth": String,
         "multi_modal_data": {'image':[PIL.Image]}, # for vllm inference
         "mm_processor_kwargs": {'fps':[]}, # used for video useless now
-        "pixel_values": Tensor, # [token_num, token_length]
+        "pixel_values": Tensor, # [grid_num, pixel_num]
         "image_grid_thw": Tensor, # [1,3] 3 means t,h,w
     }
     """
@@ -53,115 +49,92 @@ def __init__(
         max_prompt_tokens_length: int,
         tokenizer: AutoTokenizer = None,
         processor: AutoProcessor = None,
-        enable_thinking=False
+        enable_thinking=False,
+        raw_chat=False
     ):  # pylint: disable=super-init-not-called
         super().__init__()
 
         self.tokenizer = tokenizer
         self.processor = processor
 
-        # TODO default key for input_data
-        self.prompt_key = "prompt"
-        self.image_key = "images"
-        self.video_key = "videos"
         self.data = []
         self.max_prompt = 0
 
         for data_item in data_list:
-            messages = self._build_messages(data_item)
-
-            model_inputs = {}
-
-            assert self.processor is not None
-
-            raw_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, enable_thinking=enable_thinking)
-            # multi_modal_data = {}
-
-            images = None
-            if self.image_key in data_item and data_item.get(self.image_key, None) is not None:
-                images = [process_image(image) for image in data_item.pop(self.image_key)]
-
-                # due to the image key is "image" instead of "images" in vllm, we need to use "image" here
-                # link: https://github.com/vllm-project/vllm/blob/3c545c0c3b98ee642373a308197d750d0e449403/vllm/multimodal/parse.py#L205
-                # multi_modal_data["image"] = images
-
-            videos = None
-            if self.video_key in data_item and data_item.get(self.video_key, None) is not None:
-                videos = [process_video(video) for video in data_item.pop(self.video_key)]
-
-                # due to the video key is "video" instead of "videos" in vllm, we need to use "video" here
-                # link: https://github.com/vllm-project/vllm/blob/3c545c0c3b98ee642373a308197d750d0e449403/vllm/multimodal/parse.py#L205
-                # multi_modal_data["video"] = [video.numpy() for video in videos]
-
-            # TODO support video. Only images are supported now.
-            multi_modal_data = {'image':images}
-            mm_processor_kwargs = {'fps': []}
-
-            model_inputs = self.processor(text=[raw_prompt], images=images, videos=videos, return_tensors="pt")
-            input_ids = model_inputs.pop("input_ids")
-            attention_mask = model_inputs.pop("attention_mask")
-            image_grid_thw = model_inputs.get("image_grid_thw")
-            pixel_values = model_inputs.get("pixel_values")
-
-            # text only input_ids for vllm
-            raw_input_ids = self.tokenizer.encode(raw_prompt, add_special_tokens=False)
-
-            position_ids, rope_deltas = get_rope_index(
-                self.processor,
-                input_ids=input_ids,
-                image_grid_thw=model_inputs.get("image_grid_thw"),
-                video_grid_thw=model_inputs.get("video_grid_thw"),
-                second_per_grid_ts=model_inputs.get("second_per_grid_ts"),
-                attention_mask=attention_mask,
-            )
-
+            messages = data_item.get("messages")
             data_source = data_item.get("data_source", "")
             ground_truth = data_item["reward_model"]["ground_truth"]
-
-            # for vl model, raw_input_ids is only text input_ids for vllm inference
-            # input_ids is used for model forward_step and sglang inference (with image pad)
-            # sglang support both input_ids and raw_input_ids but to merge in all_tokens, input_ids is used
+            agent_name = data_item.get("agent_name", None)
+            agent_cfg_path = data_item.get("agent_cfg_path", None)
             processed_data = {
-                "raw_input_ids": raw_input_ids,
-                "input_ids": input_ids[0].tolist(),
-                "prompt_token_length": len(input_ids[0].tolist()),
-                "prompt": raw_prompt,
-                "position_ids": position_ids.squeeze().tolist(),
-                "rope_deltas": rope_deltas,
                 "data_source": data_source,
                 "ground_truth": ground_truth,
-                "multi_modal_data": multi_modal_data,
-                "mm_processor_kwargs": mm_processor_kwargs,
-                "pixel_values": pixel_values,
-                "image_grid_thw": image_grid_thw
+                "agent_name": agent_name,
+                "agent_cfg_path": agent_cfg_path
             }
-
-            if len(input_ids[0]) > self.max_prompt:
-                self.max_prompt = len(input_ids[0])
-
-            if max_prompt_tokens_length > len(input_ids[0]):
+            for message in messages:
+                message['content'] = [
+                {k: v for k, v in item.items() if v is not None}
+                for item in message['content']
+            ]
+            if not raw_chat:
+
+                model_inputs = {}
+
+                assert self.processor is not None
+
+                raw_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, enable_thinking=enable_thinking)
+                images, videos = process_vision_info(messages)
+
+                # TODO support video. Only images are supported now.
+                multi_modal_data = {'image':images}
+                mm_processor_kwargs = {'fps': []}
+
+                model_inputs = self.processor(text=[raw_prompt], images=images, videos=videos, return_tensors="pt")
+                input_ids = model_inputs.pop("input_ids")
+                attention_mask = model_inputs.pop("attention_mask")
+                image_grid_thw = model_inputs.get("image_grid_thw")
+                pixel_values = model_inputs.get("pixel_values")
+
+                # text only input_ids for vllm
+                raw_input_ids = self.tokenizer.encode(raw_prompt, add_special_tokens=False)
+                # get position_ids used for sequence packing
+                position_ids, _ = get_rope_index(
+                    self.processor,
+                    input_ids=input_ids,
+                    image_grid_thw=model_inputs.get("image_grid_thw"),
+                    video_grid_thw=model_inputs.get("video_grid_thw"),
+                    second_per_grid_ts=model_inputs.get("second_per_grid_ts"),
+                    attention_mask=attention_mask,
+                )
+
+                # for vl model, raw_input_ids is only text input_ids for vllm inference
+                # input_ids is used for model forward_step and sglang inference (with image pad)
+                # sglang support both input_ids and raw_input_ids but to merge in all_tokens, input_ids is used
+                processed_data.update({
+                    "raw_input_ids": raw_input_ids,
+                    "input_ids": input_ids[0].tolist(),
+                    "prompt_token_length": len(input_ids[0].tolist()),
+                    "prompt": raw_prompt,
+                    "position_ids": position_ids.squeeze().tolist(),
+                    "multi_modal_data": multi_modal_data,
+                    "mm_processor_kwargs": mm_processor_kwargs,
+                    "pixel_values": pixel_values,
+                    "image_grid_thw": image_grid_thw
+                })
+                if len(input_ids[0]) > self.max_prompt:
+                    self.max_prompt = len(input_ids[0])
+
+                if max_prompt_tokens_length > len(input_ids[0]):
+                    self.data.append(processed_data)
+            else:
+                # used in agent module
+                processed_data.update(
+                    {"messages": messages}
+                )
                 self.data.append(processed_data)
-        self.valid_ratio = len(self.data) / len(data_list)
-    def _build_messages(self, example: dict):
-        messages: list = example.pop(self.prompt_key)
 
-        if self.image_key in example or self.video_key in example:
-            for message in messages:
-                content = message["content"]
-                content_list = []
-                segments = re.split("(<image>|<video>)", content)
-                segments = [item for item in segments if item != ""]
-                for segment in segments:
-                    if segment == "<image>":
-                        content_list.append({"type": "image"})
-                    elif segment == "<video>":
-                        content_list.append({"type": "video"})
-                    else:
-                        content_list.append({"type": "text", "text": segment})
-
-                message["content"] = content_list
-
-        return messages
+        self.valid_ratio = len(self.data) / len(data_list)
 
     def __getitem__(self, ix: int):
         return self.data[ix]
diff --git a/chatlearn/models/agent/agent_module.py b/chatlearn/models/agent/agent_module.py
index 4f31d81b..98aceda9 100644
--- a/chatlearn/models/agent/agent_module.py
+++ b/chatlearn/models/agent/agent_module.py
@@ -49,7 +49,12 @@ def build_agent_graph(self, agent_name: str, agent_cfg_path: str) -> BaseAgentGr
 
         cfg = OmegaConf.load(agent_cfg_path) if agent_cfg_path else None
         graph_instance = _graph_registry[agent_name](
-            agent_name=agent_name, cfg=cfg, llm=self.llm, tokenizer=self.tokenizer
+            agent_name=agent_name,
+            cfg=cfg,
+            llm=self.llm,
+            tokenizer = self.tokenizer,
+            processor=self.processor,
+            model_type=self.runtime_args.model_type
         )
         self.agent_factory[agent_name] = graph_instance
         return graph_instance
@@ -82,11 +87,15 @@ def postprocess_func(
         data_output = []
         for output, input_data in zip(batched_outputs, input_data_list):
             prompt_token_ids = output.prompt_ids
+            pixel_values = output.pixel_values
+            image_grid_thw = output.image_grid_thw
+            position_ids = output.position_ids
             response_token_length = len(output.all_token_ids) - len(output.prompt_ids)
             prompt_token_length = len(output.prompt_ids)
             str_outputs = output.str_output
             all_tokens = torch.tensor(output.all_token_ids)
             loss_mask = torch.tensor(output.loss_mask)
+
             input_data.update(
                 {
                     "loss_mask": loss_mask,
@@ -96,6 +105,10 @@ def postprocess_func(
                     "prompt_token_length": prompt_token_length,
                     "all_token_length": response_token_length + prompt_token_length,
                     "str_outputs": str_outputs,
+                    # multimodal-related fields
+                    "pixel_values": pixel_values,
+                    "image_grid_thw": image_grid_thw,
+                    "position_ids": position_ids
                 }
             )
             data_output.append(input_data)
diff --git a/chatlearn/models/agent/base_agent_graph.py b/chatlearn/models/agent/base_agent_graph.py
index c2d0daf4..f6532220 100644
--- a/chatlearn/models/agent/base_agent_graph.py
+++ b/chatlearn/models/agent/base_agent_graph.py
@@ -20,11 +20,13 @@
 from langgraph.graph import StateGraph
 from omegaconf import DictConfig
 from pydantic import BaseModel
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoProcessor
 
 from chatlearn.models.agent.chat_model import (CustomChatModel,
-                                               find_last_ai_index)
+                                               find_last_ai_index,
+                                               find_first_ai_index)
 from chatlearn.models.sglang_module import AsyncEngine
+from chatlearn.models.patches.transformers.qwen2_5_vl_patch import get_rope_index
 
 
 def find_first_zero_group_end(lst):
@@ -37,22 +39,25 @@ def find_first_zero_group_end(lst):
 # modified from https://github.com/volcengine/verl/blob/main/verl/experimental/agent_loop/agent_loop.py#L121
 class AgentGraphOutput(BaseModel):
     """AgentGraphOutput"""
-    # TODO: support multi_modal
+    # total rollout string
     str_output: str
-    """total rollout str"""
+    # Prompt token ids
     prompt_ids: list[int]
-    """Prompt token ids."""
+    # all token ids including prompt, LLM generated token, tool response token.
     all_token_ids: list[int]
-    """all token ids including prompt, LLM generated token, tool response token."""
+    # loss mask, 1 for LLM generated token, 0 for tool response token, input prompt.
     loss_mask: list[int]
-    """loss mask, 1 for LLM generated token, 0 for tool response token, input prompt."""
     response_logprobs: Optional[list[float]] = None
+    # Reward score for the trajectory
     reward_score: Optional[float] = None
-    """Reward score for the trajectory."""
+    # Number of chat turns, including user, assistant, tool
     num_turns: int = 0
-    """Number of chat turns, including user, assistant, tool."""
+    # multimodal-related fields
+    pixel_values: Any = None
+    image_grid_thw: Any = None
+    position_ids: Any = None
+    # Extra fields for dynamic addition.
     extra_fields: dict[str, Any] = {}
-    """Extra fields for dynamic addition."""
 
 
 class BaseAgentGraph:
@@ -66,6 +71,8 @@ def __init__(
         cfg: DictConfig,
         llm: AsyncEngine,
         tokenizer: AutoTokenizer,
+        processor: AutoProcessor = None,
+        model_type: str = "llm",
         **kwargs
     ):
 
@@ -73,6 +80,8 @@ def __init__(
         self.cfg = cfg
         self.llm = llm
         self.tokenizer = tokenizer
+        self.processor = processor
+        self.model_type = model_type
 
     @abstractmethod
     async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentGraphOutput:
@@ -80,13 +89,17 @@ async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentGraphOutp
 
     def build_graph(self) -> StateGraph:
         self.chatmodel = CustomChatModel(
-            model=self.agent_name, llm=self.llm, tokenizer=self.tokenizer
+            model=self.agent_name,
+            llm=self.llm,
+            tokenizer=self.tokenizer,
+            processor=self.processor,
+            model_type=self.model_type
         )
 
     def convert_agent_graph_output(self, messages: Dict) -> AgentGraphOutput:
         messages = messages["messages"]
         last_ai_message_idx = find_last_ai_index(messages)
-
+        first_ai_message_idx = find_first_ai_index(messages)
         # discard messages after last ai message
         all_token_ids = messages[last_ai_message_idx].response_metadata["token_ids"]
         loss_mask = messages[last_ai_message_idx].response_metadata["loss_mask"]
@@ -94,10 +107,30 @@ def convert_agent_graph_output(self, messages: Dict) -> AgentGraphOutput:
         prompt_ids = all_token_ids[: prompt_end_idx + 1]
         num_turns = last_ai_message_idx + 1
         str_output = self.tokenizer.decode(all_token_ids[prompt_end_idx + 1 :])
+
+        pixel_values, image_grid_thw, position_ids = None, None, None
+        multimodel_batch_feature = messages[first_ai_message_idx].response_metadata.get("multimodel_batch_feature", None)
+        if multimodel_batch_feature:
+            pixel_values = multimodel_batch_feature.get("pixel_values")
+            image_grid_thw = multimodel_batch_feature.get("image_grid_thw")
+            # need to get position ids used in sequence packing
+            position_ids, _ = get_rope_index(
+                self.processor,
+                input_ids=multimodel_batch_feature.get("input_ids"),
+                image_grid_thw=multimodel_batch_feature.get("image_grid_thw"),
+                video_grid_thw=multimodel_batch_feature.get("video_grid_thw"),
+                second_per_grid_ts=multimodel_batch_feature.get("second_per_grid_ts"),
+                attention_mask=multimodel_batch_feature.get("attention_mask"),
+            )
+            position_ids = position_ids.squeeze().tolist()
+
         return AgentGraphOutput(
             str_output=str_output,
             prompt_ids=prompt_ids,
             all_token_ids=all_token_ids,
             loss_mask=loss_mask,
             num_turns=num_turns,
+            pixel_values=pixel_values,
+            image_grid_thw=image_grid_thw,
+            position_ids=position_ids
         )
diff --git a/chatlearn/models/agent/chat_model.py b/chatlearn/models/agent/chat_model.py
index 6c8eb04d..4d64edfc 100644
--- a/chatlearn/models/agent/chat_model.py
+++ b/chatlearn/models/agent/chat_model.py
@@ -3,7 +3,7 @@
 import asyncio
 import json
 import uuid
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Optional
 
 from langchain_core.language_models import BaseChatModel
 from langchain_core.language_models.base import LanguageModelInput
@@ -14,6 +14,8 @@
 from langchain_core.runnables import Runnable
 from langchain_core.utils.function_calling import convert_to_openai_tool
 from pydantic import Field
+from qwen_vl_utils import process_vision_info
+from transformers.feature_extraction_utils import BatchFeature
 
 from chatlearn.models.agent.tool_parser import ToolParser
 from chatlearn.utils.logger import logger
@@ -25,13 +27,20 @@ def find_last_ai_index(messages):
             return i
     return -1
 
+def find_first_ai_index(messages):
+    """Return the index of the first message whose ``type`` is "ai", or -1 if none."""
+    ai_positions = (pos for pos, msg in enumerate(messages) if getattr(msg, "type", None) == "ai")
+    # ``next`` with a default yields the first match, or -1 when nothing matches.
+    return next(ai_positions, -1)
 
 class CustomChatModel(BaseChatModel):
     """CustomChatModel for async sglang"""
     model_name: str = Field(alias="model")
     llm: Any
-    tokenizer: Any
 
+    tokenizer: Any
+    processor: Any
+    model_type: str = "llm"
     # used for tool call
     max_parallel_calls: int = 1
 
@@ -47,6 +56,7 @@ def bind_tools(self, tools, **kwargs) -> Runnable[LanguageModelInput, BaseMessag
         formatted_tools: list = [convert_to_openai_tool(tool) for tool in tools]
         for tool in formatted_tools:
             tool["function"]["parameters"]["properties"].pop("kwargs", None)
+
         # used to remove system prompt prefix when encoding tool response
         system_prompt = self.tokenizer.apply_chat_template(
             [{}], add_generation_prompt=False, tokenize=True
@@ -84,17 +94,23 @@ async def _agenerate(
         Returns:
             ChatResult: Chat result.
         """
-        token_ids, loss_mask = await self._preprocess(messages, **kwargs)
+        processed_data = await self._preprocess(messages, **kwargs)
+        image_data = processed_data.get("image_data", None)
+
+        first_ai_message_idx = find_first_ai_index(messages)
+        if first_ai_message_idx != -1:
+            image_data = messages[first_ai_message_idx].response_metadata.get("image_data", None)
+
         assert "sampling_params" in kwargs, "please pass sampling_params(Dict)"
         sampling_params = kwargs["sampling_params"]
         output: Dict = await self.llm.async_generate(
             prompt=None,
             sampling_params=sampling_params,
             return_logprob=False,
-            input_ids=token_ids,
+            input_ids=processed_data.get("token_ids"),
+            image_data = image_data
         )
-
-        message = await self._postprocess(output, token_ids, loss_mask)
+        message = await self._postprocess(output, processed_data)
         generation = ChatGeneration(message=message)
         return ChatResult(generations=[generation])
 
@@ -102,7 +118,6 @@ async def _preprocess(self, messages: list[BaseMessage], **kwargs: Any):
         """
         convert list[BaseMessage] to SGLangModule.generate input
         """
-
         # messages: [system], human, ai, human|tool, ai, human|tool, ...
         assert messages[-1].type in [
             "human",
@@ -110,24 +125,51 @@ async def _preprocess(self, messages: list[BaseMessage], **kwargs: Any):
         ], f"Last message must be human or tool, but got {messages[-1].type}"
         loop = asyncio.get_running_loop()
 
+        # get image and video data
+        # we assume multimodal data only appears in the first round,
+        # so we only process multimodal data in the first round
         # tokenizer prompt after last AI message
         last_ai_message_idx = find_last_ai_index(messages)
         # not find ai message, means first input
         if last_ai_message_idx == -1:
-            token_ids = await loop.run_in_executor(
+
+            openai_messages = convert_to_openai_messages(messages)
+            text = await loop.run_in_executor(
                 None,
                 lambda: self.tokenizer.apply_chat_template(
-                    convert_to_openai_messages(messages),
+                    openai_messages,
                     tools=kwargs.get("tools"),
                     add_generation_prompt=True,
                     enable_thinking=False,
-                    tokenize=True,
+                    tokenize=False,
                 ),
             )
-            return token_ids, [0] * len(token_ids)
+            if self.model_type == "llm":
+                token_ids = await loop.run_in_executor(
+                    None,
+                    lambda: self.tokenizer.encode(text)
+                )
+                return {"token_ids": token_ids, "loss_mask": [0] * len(token_ids)}
+            elif self.model_type == "vlm":
 
+                image_inputs, video_inputs = await loop.run_in_executor(
+                    None,
+                    lambda: process_vision_info(openai_messages)
+                )
+                # process multimodal data
+                multimodel_batch_feature: BatchFeature = await loop.run_in_executor(
+                    None,
+                    lambda: self.processor(text=text, images=image_inputs, videos=video_inputs, return_tensors="pt")
+                )
+                token_ids = multimodel_batch_feature.get("input_ids")[0].tolist()
+                loss_mask = [0] * len(token_ids)
+                return {"token_ids": token_ids,
+                        "loss_mask": [0] * len(token_ids),
+                        "multimodel_batch_feature": multimodel_batch_feature,
+                        "image_data": image_inputs,
+                        "video_data": video_inputs
+                        }
         # find ai message, only encode messages after last ai message
-
         else:
             remaining_messages = messages[last_ai_message_idx + 1 :]
             previous_token_ids = messages[last_ai_message_idx].response_metadata[
@@ -146,14 +188,15 @@ async def _preprocess(self, messages: list[BaseMessage], **kwargs: Any):
                     tokenize=True,
                 ),
             )
-
             remaining_token_ids = remaining_token_ids[len(kwargs["system_prompt"]) :]
             token_ids = previous_token_ids + remaining_token_ids
             loss_mask = previous_loss_mask + [0] * len(remaining_token_ids)
-            return token_ids, loss_mask
+            return {"token_ids": token_ids, "loss_mask": loss_mask}
 
-    async def _postprocess(self, output: Dict, token_ids: List, loss_mask: List):
+    async def _postprocess(self, output: Dict, processed_data: Dict):
         """convert sglang output to LangGraph AIMessage"""
+        token_ids = processed_data["token_ids"]
+        loss_mask = processed_data["loss_mask"]
         output_ids = output["output_ids"]
         completion_tokens = output["meta_info"]["completion_tokens"]
         # 198, 151667, 271, 151668, 271
@@ -203,6 +246,9 @@ async def _postprocess(self, output: Dict, token_ids: List, loss_mask: List):
             response_metadata={
                 "token_ids": token_ids,
                 "loss_mask": loss_mask,
+                "multimodel_batch_feature": processed_data.get("multimodel_batch_feature", None),
+                "image_data": processed_data.get("image_data", None),
+                "video_data": processed_data.get("video_data", None)
             },
         )
         return message
diff --git a/chatlearn/models/agent/examples/__init__.py b/chatlearn/models/agent/examples/__init__.py
index 61f337e7..abd71aac 100644
--- a/chatlearn/models/agent/examples/__init__.py
+++ b/chatlearn/models/agent/examples/__init__.py
@@ -1,3 +1,4 @@
 """self defined agent graph"""
 from .math_eval_agent_graph import MathEvalAgentGraph
 from .math_code_agent_graph import MathCodeAgentGraph
+from .geo3k_agent_graph import Geo3kAgentGraph
diff --git a/chatlearn/models/agent/examples/geo3k_agent_graph.py b/chatlearn/models/agent/examples/geo3k_agent_graph.py
new file mode 100644
index 00000000..ac4e58ad
--- /dev/null
+++ b/chatlearn/models/agent/examples/geo3k_agent_graph.py
@@ -0,0 +1,171 @@
+# pylint: disable=arguments-differ,cell-var-from-loop,unnecessary-lambda,bare-except,missing-module-docstring,missing-class-docstring
+import asyncio
+import copy
+from typing import Annotated, Any, Sequence, TypedDict
+
+from langchain_core.messages import BaseMessage, ToolMessage
+from langchain_core.runnables import RunnableConfig
+from langchain_core.tools import tool
+from langgraph.graph import END, StateGraph
+from langgraph.graph.message import add_messages
+from omegaconf import DictConfig
+from transformers import AutoProcessor, AutoTokenizer
+
+from chatlearn.models.agent.agent_module import register
+from chatlearn.models.agent.base_agent_graph import (AgentGraphOutput,
+                                                     BaseAgentGraph)
+from chatlearn.models.agent.chat_model import CustomChatModel
+from chatlearn.utils.rule_reward_score.geo3k import acc_reward
+
+
+class AgentState(TypedDict):
+    """State passed between the graph nodes; it holds the conversation messages."""
+
+    # `add_messages` is attached to this field as its reducer.
+    # See https://langchain-ai.github.io/langgraph/concepts/low_level/#reducers
+    messages: Annotated[Sequence[BaseMessage], add_messages]
+
+
+@register("geo3k_agent")
+class Geo3kAgentGraph(BaseAgentGraph):
+    """Agent graph for geo3k whose model can call an answer-checking tool."""
+
+    def __init__(
+        self,
+        agent_name: str,
+        cfg: DictConfig,
+        llm: Any,
+        tokenizer: AutoTokenizer,
+        processor: AutoProcessor,
+        **kwargs
+    ):
+        """Initialise the base graph, install `cfg.chat_template`, then build the graph."""
+        super().__init__(agent_name, cfg, llm, tokenizer, processor, **kwargs)
+        # use the custom chat_template from the config for both processor and tokenizer
+        self.processor.chat_template = cfg.chat_template
+        self.tokenizer.chat_template = cfg.chat_template
+        self.build_graph()
+
+    def build_graph(self) -> StateGraph:
+        """Create the chat model, bind the tool and compile the graph into `self.graph`."""
+        self.chatmodel = CustomChatModel(
+            model=self.agent_name, llm=self.llm, tokenizer=self.tokenizer, processor=self.processor, model_type=self.model_type
+        )
+
+        # define node function
+        async def call_model(
+            state: AgentState,
+            config: RunnableConfig,
+        ):
+            """Agent node: invoke the chat model on the messages so far."""
+            chatmodel = config["configurable"]["chat_model"]
+            sampling_params = config["configurable"]["sampling_params"]
+            message = await chatmodel.ainvoke(
+                state["messages"], sampling_params=sampling_params
+            )
+            return {"messages": [message]}
+
+        @tool
+        def calc_geo3k_reward(answer: str, **kwargs):
+            """A tool used to verify whether the current result is correct"""
+            if acc_reward(answer, kwargs["ground_truth"], use_boxed=False):
+                return "The answer is correct."
+            else:
+                return "The answer is wrong."
+
+        async def tool_node(state: AgentState, config: RunnableConfig):
+            """Tools node: run each tool call of the last message, one ToolMessage per call."""
+            outputs = []
+            loop = asyncio.get_running_loop()
+            for tool_call in state["messages"][-1].tool_calls:
+                args = copy.deepcopy(tool_call["args"])
+                # ground_truth is taken from the run config, not from the model's arguments
+                args["ground_truth"] = config["configurable"]["ground_truth"]
+                try:
+                    # awaited before the next iteration, so the lambda sees this tool_call/args
+                    content = await loop.run_in_executor(
+                        None,
+                        lambda: tools_by_name[tool_call["name"]].func(**args),
+                    )
+                except Exception:
+                    # Tool failures are reported to the model as text. Catching Exception
+                    # (not a bare except) lets asyncio.CancelledError propagate.
+                    content = "tool execute error"
+                outputs.append(
+                    ToolMessage(
+                        content=content,
+                        name=tool_call["name"],
+                        tool_call_id=tool_call["id"],
+                    )
+                )
+            return {"messages": outputs}
+
+        def should_continue(state: AgentState):
+            """Return "continue" to run the tools node, or "end" to stop the graph."""
+            messages = state["messages"]
+            last_message = messages[-1]
+            ai_messages_cnt = 0
+            total_token_cnt = 0
+            for message in messages:
+                if message.type == "ai":
+                    ai_messages_cnt += 1
+                    # assigned, not summed: ends up as the latest AI message's loss_mask length
+                    total_token_cnt = len(message.response_metadata["loss_mask"])
+            # If there is no function call, then we finish
+            if (
+                not last_message.tool_calls
+                or ai_messages_cnt >= self.cfg.max_ai_message_turn
+                or total_token_cnt >= self.cfg.max_total_token_length
+            ):
+                return "end"
+            # Otherwise if there is, we continue
+            else:
+                return "continue"
+
+        # Define graph
+        workflow = StateGraph(AgentState)
+
+        # bind tool
+        tools = [calc_geo3k_reward]
+        tools_by_name = {tool.name: tool for tool in tools}
+        self.chatmodel = self.chatmodel.bind_tools(tools)
+
+        # add node
+        workflow.add_node("agent", call_model)
+        workflow.add_node("tools", tool_node)
+
+        # add edges
+        workflow.set_entry_point("agent")
+
+        workflow.add_conditional_edges(
+            # First, we define the start node. We use `agent`.
+            # This means these are the edges taken after the `agent` node is called.
+            "agent",
+            # Next, we pass in the function that will determine which node is called next.
+            should_continue,
+            {
+                # If `tools`, then we call the tool node.
+                "continue": "tools",
+                # Otherwise we finish.
+                "end": END,
+            },
+        )
+        workflow.add_edge("tools", "agent")
+        self.graph = workflow.compile()
+
+    async def run(
+        self, messages, sampling_params: dict[str, Any], **kwargs
+    ) -> AgentGraphOutput:
+        """Run the graph on `messages`; `kwargs` must provide "ground_truth" for the tool node."""
+        config = {
+            "configurable": {
+                "chat_model": self.chatmodel,
+                "sampling_params": sampling_params,
+                "ground_truth": kwargs["ground_truth"],
+            }
+        }
+
+        graph_output = await self.graph.ainvoke(input={"messages": messages}, config=config)
+        loop = asyncio.get_running_loop()
+        # the conversion runs in the default executor rather than on the event loop
+        return await loop.run_in_executor(None, lambda: self.convert_agent_graph_output(graph_output))
diff --git a/chatlearn/models/agent/examples/math_code_agent_graph.py b/chatlearn/models/agent/examples/math_code_agent_graph.py
index 53db5300..a5417253 100644
--- a/chatlearn/models/agent/examples/math_code_agent_graph.py
+++ b/chatlearn/models/agent/examples/math_code_agent_graph.py
@@ -9,7 +9,7 @@
 from langgraph.graph import END, StateGraph
 from langgraph.graph.message import add_messages
 from omegaconf import DictConfig
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoProcessor
 
 from chatlearn.models.agent.agent_module import register
 from chatlearn.models.agent.base_agent_graph import (AgentGraphOutput,
@@ -34,9 +34,10 @@ def __init__(
         cfg: DictConfig,
         llm: Any,
         tokenizer: AutoTokenizer,
+        processor: AutoProcessor,
         **kwargs
     ):
-        super().__init__(agent_name, cfg, llm, tokenizer, **kwargs)
+        super().__init__(agent_name, cfg, llm, tokenizer, processor, **kwargs)
         self.build_graph()
 
     def build_graph(self) -> StateGraph:
@@ -44,7 +45,7 @@ def build_graph(self) -> StateGraph:
         # pip install agentscope==1.0.4 && pip install wandb==0.19.3
         from agentscope.tool import execute_python_code
         self.chatmodel = CustomChatModel(
-            model=self.agent_name, llm=self.llm, tokenizer=self.tokenizer
+            model=self.agent_name, llm=self.llm, tokenizer=self.tokenizer, processor=self.processor
         )
 
         # define node function
diff --git a/chatlearn/models/agent/examples/math_eval_agent_graph.py b/chatlearn/models/agent/examples/math_eval_agent_graph.py
index 964263df..94d8db15 100644
--- a/chatlearn/models/agent/examples/math_eval_agent_graph.py
+++ b/chatlearn/models/agent/examples/math_eval_agent_graph.py
@@ -9,7 +9,7 @@
 from langgraph.graph import END, StateGraph
 from langgraph.graph.message import add_messages
 from omegaconf import DictConfig
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoProcessor
 
 from chatlearn.models.agent.agent_module import register
 from chatlearn.models.agent.base_agent_graph import (AgentGraphOutput,
@@ -35,15 +35,15 @@ def __init__(
         cfg: DictConfig,
         llm: Any,
         tokenizer: AutoTokenizer,
+        processor: AutoProcessor,
         **kwargs
     ):
-        super().__init__(agent_name, cfg, llm, tokenizer, **kwargs)
+        super().__init__(agent_name, cfg, llm, tokenizer, processor, **kwargs)
         self.build_graph()
 
     def build_graph(self) -> StateGraph:
-
         self.chatmodel = CustomChatModel(
-            model=self.agent_name, llm=self.llm, tokenizer=self.tokenizer
+            model=self.agent_name, llm=self.llm, tokenizer=self.tokenizer, processor=self.processor
         )
 
         # define node function
diff --git a/chatlearn/models/base_module.py b/chatlearn/models/base_module.py
index fb141f5e..171ea816 100644
--- a/chatlearn/models/base_module.py
+++ b/chatlearn/models/base_module.py
@@ -297,7 +297,8 @@ def build_dataset(self, prompts, is_eval=False):
                 max_prompt_tokens_length,
                 self.tokenizer,
                 self.processor,
-                enable_thinking=enable_thinking
+                enable_thinking=enable_thinking,
+                raw_chat=self.runtime_args.raw_chat
             )
         else:
             from chatlearn.data.prompt_dataset import PromptPipeline
diff --git a/chatlearn/models/patches/transformers/qwen2_5_vl_patch.py b/chatlearn/models/patches/transformers/qwen2_5_vl_patch.py
index 21143d8a..e62309c3 100644
--- a/chatlearn/models/patches/transformers/qwen2_5_vl_patch.py
+++ b/chatlearn/models/patches/transformers/qwen2_5_vl_patch.py
@@ -137,7 +137,6 @@ def get_rope_index(
 
         return position_ids, mrope_position_deltas
 
-
 def prepare_fa2_from_position_ids(query, key, value, position_ids):
     """
     change vl position ids to fa2 format
@@ -161,6 +160,7 @@ def prepare_fa2_from_position_ids(query, key, value, position_ids):
     # =========================================================================
 
     return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length))
+
 def Qwen2_5_VLFlashAttention2_patched_forward(
     self,
     hidden_states: torch.Tensor,
diff --git a/docs/en/tutorial/tutorial_grpo_mcore_qwenvl.md b/docs/en/tutorial/tutorial_grpo_mcore_qwenvl.md
index 8480d6c6..06a13d05 100644
--- a/docs/en/tutorial/tutorial_grpo_mcore_qwenvl.md
+++ b/docs/en/tutorial/tutorial_grpo_mcore_qwenvl.md
@@ -17,7 +17,7 @@ You can use a VPC address to accelerate image pulling. The image address should
 ```bash
 git clone https://github.com/alibaba/ChatLearn.git
 wget http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/csrc/megatron-patch-release/0922/Pai-Megatron-Patch.tar.gz
-tar -xvf Pai-Megatron-Patch.tar
+tar -xvf Pai-Megatron-Patch.tar.gz
 ```
 
 ## Data & Model Preparation
@@ -46,7 +46,7 @@ cd ../Pai-Megatron-Patch/toolkits/distributed_checkpoints_convertor
 bash scripts/qwen2_5_vl/run_8xH20.sh \
 7B \
 ${CHATLEARN_ROOT}/pretrained_models/Qwen2.5-VL-7B-Instruct  \
-${CHATLEARN_ROOT}/pretrained_models//Qwen2.5-VL-7B-Instruct-to-mcore \
+${CHATLEARN_ROOT}/pretrained_models/Qwen2.5-VL-7B-Instruct-to-mcore \
 false  \
 true  \
 bf16
diff --git a/docs/zh/tutorial/tutorial_grpo_mcore_qwenvl.md b/docs/zh/tutorial/tutorial_grpo_mcore_qwenvl.md
index 42063e32..35d2c1b5 100644
--- a/docs/zh/tutorial/tutorial_grpo_mcore_qwenvl.md
+++ b/docs/zh/tutorial/tutorial_grpo_mcore_qwenvl.md
@@ -16,7 +16,7 @@ dsw-registry.cn-shanghai.cr.aliyuncs.com/pai-training-algorithm/chatlearn:torch2
 ```bash
 git clone https://github.com/alibaba/ChatLearn.git
 wget http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/csrc/megatron-patch-release/0922/Pai-Megatron-Patch.tar.gz
-tar -xvf Pai-Megatron-Patch.tar
+tar -xvf Pai-Megatron-Patch.tar.gz
 ```
 
 ## 数据&模型准备
@@ -44,7 +44,7 @@ cd ../Pai-Megatron-Patch/toolkits/distributed_checkpoints_convertor
 bash scripts/qwen2_5_vl/run_8xH20.sh \
 7B \
 ${CHATLEARN_ROOT}/pretrained_models/Qwen2.5-VL-7B-Instruct  \
-${CHATLEARN_ROOT}/pretrained_models//Qwen2.5-VL-7B-Instruct-to-mcore \
+${CHATLEARN_ROOT}/pretrained_models/Qwen2.5-VL-7B-Instruct-to-mcore \
 false  \
 true  \
 bf16
diff --git a/scripts/fsdp_sglang/train_fsdp_sglang_qwen2_5_vl_7b_grpo_agent.sh b/scripts/fsdp_sglang/train_fsdp_sglang_qwen2_5_vl_7b_grpo_agent.sh
new file mode 100644
index 00000000..e635c958
--- /dev/null
+++ b/scripts/fsdp_sglang/train_fsdp_sglang_qwen2_5_vl_7b_grpo_agent.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# GRPO agent training for Qwen2.5-VL-7B on geo3k: FSDP config template with SGLang rollout.
+set -x
+# Run from the ChatLearn repository root: CHATLEARN is set from $(pwd) and the paths below are relative.
+export CHATLEARN=$(pwd)
+export PYTHONPATH=${CHATLEARN}:${PYTHONPATH}
+source scripts/base_env.sh
+export RAY_DEDUP_LOGS=1
+export exp_name=qwen2-5-vl-grpo-7b-sglang-agent
+# NOTE(review): ${num_device} is not set in this script — presumably exported by scripts/base_env.sh; confirm.
+python chatlearn/entrypoint.py grpo \
+        --config-file template/grpo_fsdp.yaml \
+        runtime_args.exp_name=${exp_name} \
+        runtime_args.rollout_backend=sglang \
+        runtime_args.model_type=vlm \
+        runtime_args.raw_chat=True \
+        runtime_args.use_rollout_manager=True \
+        runtime_args.data_rerank=False \
+        runtime_args.task_type=agent \
+        runtime_args.data_path=${CHATLEARN}/dataset/geo3k/train_agent.parquet \
+        runtime_args.eval_data_path=${CHATLEARN}/dataset/geo3k/test_agent.parquet \
+        runtime_args.output_dir=${CHATLEARN}/output/${exp_name} \
+        runtime_args.num_episode=60 \
+        runtime_args.sample_per_episode=2560 \
+        runtime_args.train_global_batch_size=640 \
+        runtime_args.train_micro_batch_size=80 \
+        runtime_args.save_episode_interval=15 \
+        runtime_args.eval_episode_interval=5 \
+        runtime_args.enable_eval_before_training=False \
+        runtime_args.log_args_dict.enable_wandb=False \
+        runtime_args.log_args_dict.wandb_project=your_wandb_project \
+        models.policy_trainer.num_gpu=${num_device} \
+        models.policy_trainer.packing=True \
+        models.policy_trainer.meta_init=False \
+        models.policy_trainer.groupgemm=False \
+        models.policy_trainer.generation_batch_size=320 \
+        models.policy_trainer.ulysses_sequence_parallel_size=1 \
+        models.policy_trainer.load=${CHATLEARN}/pretrained_models/Qwen2.5-VL-7B-Instruct/ \
+        models.policy_trainer.optimizer.lr=1e-6 \
+        models.policy_trainer.pos_clip_ratio=0.27 \
+        models.policy_trainer.neg_clip_ratio=0.2 \
+        models.policy_trainer.kl_coef=0.01 \
+        models.ref_policy.generation_batch_size=320 \
+        models.policy.is_sync_mode=False \
+        models.policy.generation_batch_size=320 \
+        models.policy.enforce_eager=False \
+        models.policy.tensor_model_parallel_size=1 \
+        models.policy.max_prompt_tokens_length=1024 \
+        models.policy.max_response_tokens_length=2048 \
+        models.policy.num_inference_per_prompt=5 \
+        models.policy.gpu_memory_utilization=0.85 \
+        models.policy.enable_thinking=False \
+        models.reward.generation_batch_size=256 \
+        2>&1 | tee log_${exp_name}.log ; exit ${PIPESTATUS[0]}
diff --git a/template/agent/geo3k_eval.yaml b/template/agent/geo3k_eval.yaml
new file mode 100644
index 00000000..d3ef19f4
--- /dev/null
+++ b/template/agent/geo3k_eval.yaml
@@ -0,0 +1,3 @@
+max_total_token_length: 32768
+max_ai_message_turn: 3
+chat_template: "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{%- if tools %}{{- '<|im_start|>system\\n' }}{%- if messages[0]['role'] == 'system' %}{{- messages[0]['content'] }}{%- else %}{{- 'You are a helpful assistant.' }}{%- endif %}{{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}{%- for tool in tools %}{{- \"\\n\" }}{{- tool | tojson }}{%- endfor %}{{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}{% for message in messages %}{% if message['role'] != 'system' or loop.first == false %}{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{%- elif message.role == \"assistant\" %}{{- '<|im_start|>' + message.role }}{%- if message.content %}{{- '\\n' + message.content }}{%- endif %}{%- for tool_call in message.tool_calls %}{%- if tool_call.function is defined %}{%- set tool_call = 
tool_call.function %}{%- endif %}{{- '\\n<tool_call>\\n{\"name\": \"' }}{{- tool_call.name }}{{- '\", \"arguments\": ' }}{{- tool_call.arguments | tojson }}{{- '}\\n</tool_call>' }}{%- endfor %}{{- '<|im_end|>\\n' }}{%- elif message.role == \"tool\" %}{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}{{- '<|im_start|>user' }}{%- endif %}{{- '\\n<tool_response>\\n' }}{% if message['content'] is string %}{{ message.content }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif content['type'] == 'text' or 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{- '\\n</tool_response>' }}{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}{{- '<|im_end|>\\n' }}{%- endif %}{%- endif %}{% endif %}{% endfor %}{%- else %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in 
content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{%- elif message.role == \"assistant\" %}{{- '<|im_start|>' + message.role }}{%- if message.content %}{{- '\\n' + message.content }}{%- endif %}{%- for tool_call in message.tool_calls %}{%- if tool_call.function is defined %}{%- set tool_call = tool_call.function %}{%- endif %}{{- '\\n<tool_call>\\n{\"name\": \"' }}{{- tool_call.name }}{{- '\", \"arguments\": ' }}{{- tool_call.arguments | tojson }}{{- '}\\n</tool_call>' }}{%- endfor %}{{- '<|im_end|>\\n' }}{%- elif message.role == \"tool\" %}{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}{{- '<|im_start|>user' }}{%- endif %}{{- '\\n<tool_response>\\n' }}{% if message['content'] is string %}{{ message.content }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif content['type'] == 'text' or 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{- '\\n</tool_response>' }}{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}{{- '<|im_end|>\\n' }}{%- endif %}{%- endif %}{% endfor %}{%- endif %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
\ No newline at end of file