diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..01b3388 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,21 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**", + "${workspaceFolder}/kuiper/include", + "/usr/local/cuda/include", + "/usr/include/c++/**", + "/usr/include" + ], + "defines": [], + "compilerPath": "/usr/bin/g++", + "cStandard": "c17", + "cppStandard": "c++17", + "intelliSenseMode": "linux-gcc-x64", + "compileCommands": "${workspaceFolder}/build/compile_commands.json" + } + ], + "version": 4 +} diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..630ce10 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,30 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "C++ Launch", + "type": "cppdbg", + "request": "launch", + "program": "/home/tangjin/KuiperLLama/build/demo/llama_infer", // 可执行文件路径 + "args": [], // 可选参数 + "stopAtEntry": false, + "cwd": "${workspaceFolder}", + "environment": [], + "externalConsole": false, + "MIMode": "gdb", + "miDebuggerPath": "/usr/bin/gdb", // 调试器路径,可以是gdb或lldb + "setupCommands": [ + { + "description": "Enable pretty-printing for gdb", + "text": "-enable-pretty-printing", + "ignoreFailures": true + } + ], + "preLaunchTask": "build", // 构建任务名称, 可选 + "internalConsoleOptions": "openOnSessionStart" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..565c0e0 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,118 @@ + +{ + + "files.associations": { + "cctype": "cpp", + "cmath": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "chrono": "cpp", + "compare": "cpp", + "concepts": "cpp", + "cstdint": "cpp", + "unordered_map": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "random": "cpp", + "ratio": "cpp", + "string": "cpp", + "string_view": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "initializer_list": "cpp", + "iosfwd": "cpp", + "istream": "cpp", + "limits": "cpp", + "new": "cpp", + "ostream": "cpp", + "ranges": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "typeinfo": "cpp", + "__nullptr": "cpp", + "iostream": "cpp", + "cstdarg": "cpp", + "clocale": "cpp", + "complex": "cpp", + "condition_variable": "cpp", + "deque": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "fstream": "cpp", + "iomanip": "cpp", + "mutex": "cpp", + "numbers": "cpp", + "numeric": "cpp", + "optional": "cpp", + "semaphore": "cpp", + "stop_token": "cpp", + "system_error": "cpp", + "typeindex": "cpp", + "variant": "cpp", + "filesystem": "cpp", + "any": "cpp", + "regex": "cpp", + "unordered_set": "cpp", + "*.inc": "cpp", + "*.ipp": "cpp", + "span": "cpp", + "__node_handle": "cpp", + "__split_buffer": "cpp", + "queue": "cpp", + "stack": "cpp", + "shared_mutex": "cpp", + "__mutex_base": "cpp", + "valarray": "cpp", + "__hash_table": "cpp", + "__tree": "cpp", + "__locale": "cpp", 
+ "__string": "cpp", + "forward_list": "cpp", + "__bit_reference": "cpp", + "csetjmp": "cpp", + "csignal": "cpp", + "strstream": "cpp", + "cfenv": "cpp", + "cinttypes": "cpp", + "codecvt": "cpp", + "source_location": "cpp", + "future": "cpp", + "barrier": "cpp", + "charconv": "cpp", + "coroutine": "cpp", + "cuchar": "cpp", + "latch": "cpp", + "scoped_allocator": "cpp", + "syncstream": "cpp", + "hash_map": "cpp", + "hash_set": "cpp", + "__config": "cpp", + "__tuple": "cpp", + "ios": "cpp" + }, + "C_Cpp.errorSquiggles": "disabled", + "window.zoomLevel": 3, + "window.zoomPerWindow": false, + "testMate.cpp.test.executables": "/home/tangjin/KuiperLLama/build/*/*", + "testMate.cpp.test.workingDirectory": "${absDirpath}", + "cmake.ctest.testExplorerIntegrationEnabled": true +} \ No newline at end of file diff --git a/.vscode/task.json b/.vscode/task.json new file mode 100644 index 0000000..ca5fa8b --- /dev/null +++ b/.vscode/task.json @@ -0,0 +1,16 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "cmake", + "label": "build", + "command": "build", + "targets": [ + "all" + ], + "group": "build", + "problemMatcher": [], + "detail": "CMake template build task" + } + ] +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index ae718eb..e33ec4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,7 +91,8 @@ find_package(Armadillo REQUIRED) aux_source_directory(kuiper/source/tensor/ DIR_TENSOR) aux_source_directory(kuiper/source/base/ DIR_BASE) aux_source_directory(kuiper/source/op/ DIR_OP) -aux_source_directory(kuiper/source/model/ DIR_MODEL) +file(GLOB DIR_MODEL CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/kuiper/source/model/*.cpp") +list(REMOVE_ITEM DIR_MODEL "${CMAKE_CURRENT_SOURCE_DIR}/kuiper/source/model/model_paged.cpp") aux_source_directory(kuiper/source/op/kernels/cpu DIR_KERNEL_CPU) aux_source_directory(kuiper/source/op/kernels/cuda DIR_KERNEL_CUDA) aux_source_directory(kuiper/source/op/kernels/ DIR_KERNEL) diff --git a/demo/CMakeLists.txt b/demo/CMakeLists.txt index efaab22..859b53b 100644 --- a/demo/CMakeLists.txt +++ b/demo/CMakeLists.txt @@ -11,6 +11,39 @@ if (LLAMA3_SUPPORT) endif () set_target_properties(llama_infer PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +add_executable(llama_infer_paged mainpaged.cpp) +target_link_directories(llama_infer_paged PUBLIC ${PROJECT_SOURCE_DIR}/lib) +target_link_libraries(llama_infer_paged llama) +if (LLAMA3_SUPPORT) + find_package(absl REQUIRED) + find_package(re2 REQUIRED) + find_package(nlohmann_json REQUIRED) + target_link_libraries(llama_infer_paged absl::base re2::re2 nlohmann_json::nlohmann_json) +endif () +set_target_properties(llama_infer_paged PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + +add_executable(llama_infer_unified main_unified.cpp) +target_link_directories(llama_infer_unified PUBLIC ${PROJECT_SOURCE_DIR}/lib) +target_link_libraries(llama_infer_unified llama) +if (LLAMA3_SUPPORT) + find_package(absl REQUIRED) + find_package(re2 REQUIRED) + find_package(nlohmann_json REQUIRED) + target_link_libraries(llama_infer_unified absl::base re2::re2 nlohmann_json::nlohmann_json) +endif () +set_target_properties(llama_infer_unified PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + +add_executable(llama_infer_continuous main_continuous.cpp) +target_link_directories(llama_infer_continuous PUBLIC ${PROJECT_SOURCE_DIR}/lib) +target_link_libraries(llama_infer_continuous llama) +if (LLAMA3_SUPPORT) + find_package(absl REQUIRED) + find_package(re2 REQUIRED) + find_package(nlohmann_json REQUIRED) + 
target_link_libraries(llama_infer_continuous absl::base re2::re2 nlohmann_json::nlohmann_json) +endif () +set_target_properties(llama_infer_continuous PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + if (QWEN2_SUPPORT) message(STATUS "LINK QWEN2 SUPPORT") add_executable(qwen_infer main_qwen.cpp) diff --git a/demo/main_continuous.cpp b/demo/main_continuous.cpp new file mode 100644 index 0000000..28ba219 --- /dev/null +++ b/demo/main_continuous.cpp @@ -0,0 +1,352 @@ +#include +#include +#include +#include +#include +#include +#include +#include "base/batch_metadata.h" +#include "base/kv_cache_manager.h" +#include "base/scheduler.h" +#include "base/scheduler_config.h" +#include "base/sequence.h" +#include "model/llama3.h" +#include "model/config.h" + +void get_gpu_memory_info(size_t* free, size_t* total) { + cudaError_t status = cudaMemGetInfo(free, total); + if (status != cudaSuccess) { + std::cerr << "Error: cudaMemGetInfo failed with " << cudaGetErrorString(status) << std::endl; + return; + } + std::cout << ">>> [GPU] Free Memory: " << *free / (1024 * 1024) << " MB" << std::endl; + std::cout << ">>> [GPU] Total Memory: " << *total / (1024 * 1024) << " MB" << std::endl; +} + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::cout << "Usage: ./llama_infer_continuous [num_seqs] [max_batched_tokens]" << std::endl; + std::cout << " num_seqs: number of parallel sequences (default: 4)" << std::endl; + std::cout << " max_batched_tokens: max tokens per iteration (default: 512)" << std::endl; + return -1; + } + const char* checkpoint_path = argv[1]; + const char* tokenizer_path = argv[2]; + int32_t num_seqs = (argc > 3) ? std::atoi(argv[3]) : 4; + int32_t max_batched_tokens = (argc > 4) ? std::atoi(argv[4]) : 512; + + // ========================================== + // 1. Initialize model + // ========================================== + std::cout << ">>> [Init] Loading model from " << checkpoint_path << std::endl; + + model::LLama2Model model(base::TokenizerType::kEncodeSpe, tokenizer_path, + checkpoint_path, false); + + auto init_status = model.init(base::DeviceType::kDeviceCUDA); + if (!init_status) { + std::cerr << "Model init failed: " << init_status.get_err_code() << std::endl; + return -1; + } + + std::cout << ">>> [Init] Model loaded successfully" << std::endl; + + // ========================================== + // 2. Get model config + // ========================================== + const auto* config = model.config(); + if (!config) { + std::cerr << "Error: Failed to get model config" << std::endl; + return -1; + } + + const int32_t head_dim = config->head_size_; + const int32_t num_kv_heads = config->kv_head_num_; + const int32_t num_layers = config->layer_num_; + const int32_t dim = config->dim_; + const int32_t seq_len = config->seq_len_; + const int32_t kv_dim = num_kv_heads * head_dim; + + std::cout << "\n>>> [Model Config]" << std::endl; + std::cout << " dim : " << dim << std::endl; + std::cout << " num_layers : " << num_layers << std::endl; + std::cout << " num_kv_heads : " << num_kv_heads << std::endl; + std::cout << " head_dim : " << head_dim << std::endl; + std::cout << " seq_len : " << seq_len << std::endl; + + // ========================================== + // 3. 
Prepare input sentences + // ========================================== + std::vector sentences = { + "Once upon a time, in a land far away,", + "The quick brown fox jumps over the lazy dog.", + "In the beginning, there was nothing but darkness.", + "A long time ago in a galaxy far, far away,", + "It was the best of times, it was the worst of times.", + "To be or not to be, that is the question.", + "All happy families are alike; each unhappy family is unhappy in its own way.", + "Call me Ishmael. Some years ago, never mind how long precisely,", + "Man, What can i say?", + "What are you doing, Ouch", + "Once upon a time, there was a mountain, and on the mountain, there was a temple.", + "It's been a long day without you, my friend.", + "Youth is not a time of life; it is a state of mind.", + }; + + if (num_seqs < static_cast(sentences.size())) { + sentences.resize(num_seqs); + } + int32_t actual_num_seqs = static_cast(sentences.size()); + + std::cout << "\n>>> [Input] Will process " << actual_num_seqs << " sequences" << std::endl; + + // ========================================== + // 4. Initialize Paged KV Cache + // ========================================== + const int block_size = 16; + + size_t free_mem, total_mem; + get_gpu_memory_info(&free_mem, &total_mem); + + // Calculate activation memory based on model config + // max_seq_len for activation = max_batched_tokens (chunked prefill limits this) + int32_t max_activation_seq_len = max_batched_tokens; + int32_t hidden_dim = config->hidden_dim_; // FFN intermediate dimension + + size_t activation_peak = 0; + // 1. Hidden states + activation_peak += (size_t)max_activation_seq_len * dim * sizeof(float); + // 2. Q/K/V projection tensors + activation_peak += (size_t)max_activation_seq_len * dim * sizeof(float); // Q + activation_peak += (size_t)max_activation_seq_len * kv_dim * sizeof(float); // K + activation_peak += (size_t)max_activation_seq_len * kv_dim * sizeof(float); // V + // 3. Attention output + activation_peak += (size_t)max_activation_seq_len * dim * sizeof(float); + // 4. FFN intermediate (SwiGLU has gate and up branches) + activation_peak += (size_t)2 * max_activation_seq_len * hidden_dim * sizeof(float); + // 5. Misc buffers (softmax, layernorm, etc.) 
+ activation_peak += (size_t)max_activation_seq_len * dim * sizeof(float); + + // Add safety margin (20%) + size_t reserved_memory = (size_t)(activation_peak * 1.2); + + size_t one_block_bytes = (size_t)block_size * num_layers * 2 * kv_dim * sizeof(float); + size_t kv_cache_pool_size = (size_t)(free_mem * 0.9) - reserved_memory; + int total_blocks = kv_cache_pool_size / one_block_bytes; + int max_blocks_per_seq = (seq_len + block_size - 1) / block_size; + + std::cout << "\n>>> [Memory Plan]" << std::endl; + std::cout << " Activation Peak : " << activation_peak / 1024 / 1024 << " MB" << std::endl; + std::cout << " Reserved Memory : " << reserved_memory / 1024 / 1024 << " MB" << std::endl; + std::cout << " KV Cache Pool : " << kv_cache_pool_size / 1024 / 1024 << " MB" << std::endl; + std::cout << " One Block Size : " << one_block_bytes / 1024 << " KB" << std::endl; + std::cout << " Total Blocks : " << total_blocks << std::endl; + std::cout << " Max Batched Tokens : " << max_batched_tokens << std::endl; + + auto kv_init_status = model.init_paged_kv_cache(actual_num_seqs, block_size, total_blocks, max_blocks_per_seq); + if (!kv_init_status) { + std::cerr << "Error: Failed to initialize paged KV cache" << std::endl; + return -1; + } + + auto* kv_manager = model.get_kv_cache_manager(); + + std::cout << ">>> [Init] Paged KV Cache initialized" << std::endl; + + // ========================================== + // 5. Initialize Scheduler + // ========================================== + base::SchedulerConfig sched_config; + sched_config.max_num_batched_tokens = max_batched_tokens; + sched_config.max_num_seqs = actual_num_seqs; + sched_config.max_model_len = seq_len; + sched_config.max_prefill_tokens = 128; // Chunk size for prefill + sched_config.block_size = block_size; + sched_config.min_free_blocks = 8; + sched_config.decode_priority = true; + + base::Scheduler scheduler(sched_config); + scheduler.set_kv_cache_manager(kv_manager); + + // Add all requests to scheduler + int32_t max_new_tokens = 50; + std::cout << "\n>>> [Input] Encoding and adding " << actual_num_seqs << " sequences:" << std::endl; + for (int32_t i = 0; i < actual_num_seqs; ++i) { + auto prompt_tokens = model.encode(sentences[i]); + if (prompt_tokens.empty()) { + std::cerr << "Error: Failed to encode sentence " << i << std::endl; + return -1; + } + scheduler.add_request(prompt_tokens, max_new_tokens, sentences[i]); + std::cout << " Seq " << i << ": " << prompt_tokens.size() << " tokens - \"" + << sentences[i].substr(0, 40) << "...\"" << std::endl; + } + + // ========================================== + // 6. Continuous Batching Main Loop + // ========================================== + std::cout << "\n>>> [Inference] Starting continuous batching with scheduler..." << std::endl; + std::cout << " Decode Priority: " << (sched_config.decode_priority ? 
"ON" : "OFF") << std::endl; + + int total_iterations = 0; + int total_prefill_tokens = 0; + int total_decode_tokens = 0; + + auto start_time = std::chrono::steady_clock::now(); + + // Get allocators for BatchMetadata + auto alloc_gpu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + while (!scheduler.all_finished()) { + // Get scheduling decision + auto sched_output = scheduler.schedule(); + if (!sched_output.has_work()) { + std::cout << " [Done] No more work to schedule" << std::endl; + break; + } + + // Log preempted sequences + if (!sched_output.preempted_seq_ids.empty()) { + std::cout << " [Preemption] Evicted " << sched_output.preempted_seq_ids.size() + << " sequences due to memory pressure: "; + for (int32_t pid : sched_output.preempted_seq_ids) { + std::cout << pid << " "; + } + std::cout << std::endl; + } + + // Build batch data from scheduler output + std::vector batch_seq_ids; + std::vector batch_token_counts; + std::vector batch_start_positions; + std::vector batch_context_lens; + std::vector batch_is_prefill; + std::vector all_tokens; + + for (const auto& sched_seq : sched_output.scheduled_seqs) { + batch_seq_ids.push_back(sched_seq.kv_slot); // Use kv_slot for KV cache indexing + batch_token_counts.push_back(sched_seq.num_tokens); + batch_start_positions.push_back(sched_seq.start_pos); + batch_context_lens.push_back(sched_seq.context_len); + batch_is_prefill.push_back(sched_seq.is_prefill); + all_tokens.insert(all_tokens.end(), sched_seq.tokens.begin(), sched_seq.tokens.end()); + } + + // Allocate additional blocks for decode sequences + for (const auto& sched_seq : sched_output.scheduled_seqs) { + if (!sched_seq.is_prefill) { + kv_manager->allocate_blocks_for_tokens(sched_seq.kv_slot, sched_seq.context_len); + } + } + + // Get embeddings + const auto& embedding_output = model.embedding(all_tokens); + auto [tok_ids, tok_emb, tok_num] = embedding_output; + + // Create mixed batch metadata + auto batch_meta = base::BatchMetadata::create_mixed( + batch_seq_ids, + batch_token_counts, + batch_start_positions, + batch_context_lens, + batch_is_prefill, + alloc_gpu, + alloc_cpu, + nullptr); + + // Forward pass + std::vector next_tokens; + auto forward_status = model.forward_unified(tok_emb, batch_meta, next_tokens); + if (!forward_status) { + std::cerr << "Error: forward_unified failed: " << forward_status.get_err_msg() << std::endl; + break; + } + + // Update scheduler state + scheduler.update_after_forward(sched_output, next_tokens); + + // Check for EOS and max tokens + for (size_t i = 0; i < sched_output.scheduled_seqs.size(); ++i) { + const auto& sched_seq = sched_output.scheduled_seqs[i]; + const auto* seq = scheduler.get_sequence(sched_seq.seq_id); + if (!seq) continue; + + // Check EOS + if (i < next_tokens.size() && model.is_sentence_ending(next_tokens[i])) { + scheduler.finish_sequence(sched_seq.seq_id); + continue; + } + + // Check max tokens (only for decode phase) + if (!sched_seq.is_prefill && seq->num_generated() >= max_new_tokens) { + scheduler.finish_sequence(sched_seq.seq_id); + } + } + + // Log prefill completion + for (const auto& sched_seq : sched_output.scheduled_seqs) { + if (sched_seq.is_prefill) { + const auto* seq = scheduler.get_sequence(sched_seq.seq_id); + if (seq && !seq->is_prefill()) { + std::cout << " [Prefill Done] Seq " << sched_seq.seq_id + << " completed prefill, starting decode" << std::endl; + } + } + } + + total_prefill_tokens += sched_output.num_prefill_tokens; + 
total_decode_tokens += sched_output.num_decode_tokens; + total_iterations++; + + // Progress logging every 10 iterations + if (total_iterations % 10 == 0) { + std::cout << " [Iter " << total_iterations << "] " + << "Batch: " << sched_output.num_tokens << " tokens " + << "(prefill: " << sched_output.num_prefill_tokens + << ", decode: " << sched_output.num_decode_tokens << ") " + << "| Waiting: " << scheduler.num_waiting() + << ", Running: " << scheduler.num_running() + << ", Preempted: " << scheduler.num_preempted() + << ", Finished: " << scheduler.num_finished() + << std::endl; + } + } + + auto end_time = std::chrono::steady_clock::now(); + double total_ms = std::chrono::duration_cast>( + end_time - start_time).count(); + + // ========================================== + // 7. Print generated text for each sequence + // ========================================== + std::cout << "\n>>> [Generated Text]" << std::endl; + auto all_seqs = scheduler.get_all_sequences(); + for (const auto* seq : all_seqs) { + std::string generated_text = model.decode(seq->output_tokens); + std::string prompt_preview = seq->original_text.substr(0, 30); + std::cout << " Seq " << seq->seq_id << " (" << seq->output_tokens.size() << " tokens): " + << prompt_preview << "... " << generated_text << std::endl; + } + + // ========================================== + // 8. Statistics + // ========================================== + std::cout << "\n>>> [Statistics]" << std::endl; + std::cout << " Num sequences : " << actual_num_seqs << std::endl; + std::cout << " Total iterations : " << total_iterations << std::endl; + std::cout << " Total prefill tok : " << total_prefill_tokens << std::endl; + std::cout << " Total decode tok : " << total_decode_tokens << std::endl; + std::cout << " Total time : " << total_ms << " ms" << std::endl; + std::cout << " Overall throughput : " << ((total_prefill_tokens + total_decode_tokens) * 1000.0 / total_ms) + << " tokens/sec" << std::endl; + + std::cout << "\n>>> [Scheduler Config]" << std::endl; + std::cout << " max_num_batched_tokens : " << sched_config.max_num_batched_tokens << std::endl; + std::cout << " max_prefill_tokens : " << sched_config.max_prefill_tokens << std::endl; + std::cout << " decode_priority : " << (sched_config.decode_priority ? "true" : "false") << std::endl; + + std::cout << "\n>>> [Success] Continuous batching with scheduler completed!" 
<< std::endl; + + return 0; +} \ No newline at end of file diff --git a/demo/main_unified.cpp b/demo/main_unified.cpp new file mode 100644 index 0000000..5d0a969 --- /dev/null +++ b/demo/main_unified.cpp @@ -0,0 +1,308 @@ +#include +#include +#include +#include +#include +#include +#include +#include "base/batch_metadata.h" +#include "base/kv_cache_manager.h" +#include "model/llama3.h" +#include "model/config.h" + +void get_gpu_memory_info(size_t* free, size_t* total) { + cudaError_t status = cudaMemGetInfo(free, total); + if (status != cudaSuccess) { + std::cerr << "Error: cudaMemGetInfo failed with " << cudaGetErrorString(status) << std::endl; + return; + } + std::cout << ">>> [GPU] Free Memory: " << *free / (1024 * 1024) << " MB" << std::endl; + std::cout << ">>> [GPU] Total Memory: " << *total / (1024 * 1024) << " MB" << std::endl; +} + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::cout << "Usage: ./llama_infer_unified [num_seqs] [chunk_size]" << std::endl; + std::cout << " num_seqs: number of parallel sequences (default: 4)" << std::endl; + std::cout << " chunk_size: optional, 0 for auto (default: 0)" << std::endl; + return -1; + } + const char* checkpoint_path = argv[1]; + const char* tokenizer_path = argv[2]; + int32_t num_seqs = (argc > 3) ? std::atoi(argv[3]) : 4; + int32_t chunk_size = (argc > 4) ? std::atoi(argv[4]) : 0; + + // ========================================== + // 1. Initialie model + // ========================================== + std::cout << ">>> [Init] Loading model from " << checkpoint_path << std::endl; + + model::LLama2Model model(base::TokenizerType::kEncodeSpe, tokenizer_path, + checkpoint_path, false); + + auto init_status = model.init(base::DeviceType::kDeviceCUDA); + if (!init_status) { + std::cerr << "Model init failed: " << init_status.get_err_code() << std::endl; + return -1; + } + + std::cout << ">>> [Init] Model loaded successfully" << std::endl; + + // ========================================== + // 2. Get model config + // ========================================== + const auto* config = model.config(); + if (!config) { + std::cerr << "Error: Failed to get model config" << std::endl; + return -1; + } + + const int32_t head_dim = config->head_size_; + const int32_t num_kv_heads = config->kv_head_num_; + const int32_t num_heads = config->head_num_; + const int32_t num_layers = config->layer_num_; + const int32_t dim = config->dim_; + const int32_t hidden_dim = config->hidden_dim_; + const int32_t seq_len = config->seq_len_; + const int32_t kv_dim = num_kv_heads * head_dim; + + std::cout << "\n>>> [Model Config]" << std::endl; + std::cout << " dim : " << dim << std::endl; + std::cout << " hidden_dim : " << hidden_dim << std::endl; + std::cout << " num_layers : " << num_layers << std::endl; + std::cout << " num_heads : " << num_heads << std::endl; + std::cout << " num_kv_heads : " << num_kv_heads << std::endl; + std::cout << " head_dim : " << head_dim << std::endl; + std::cout << " seq_len : " << seq_len << std::endl; + + // ========================================== + // 3. 
Prepare input sentences (multiple sequences) + // ========================================== + std::vector sentences = { + "Once upon a time, in a land far away,", + "The quick brown fox jumps over the lazy dog.", + "In the beginning, there was nothing but darkness.", + "A long time ago in a galaxy far, far away,", + "It was the best of times, it was the worst of times.", + "To be or not to be, that is the question.", + "All happy families are alike; each unhappy family is unhappy in its own way.", + "Call me Ishmael. Some years ago, never mind how long precisely,", + "Man, What can i say?", + "What are you doing, Ouch" , + "Once upon a time, there was a mountain, and on the mountain, there was a temple.", + "It's been a long day without you, my friend.", + "Youth is not a time of life; it is a state of mind;it is not a matter of rosy cheeks, red lips and supple knees;it is a matter of the will, a quality of the imagination, a vigor of the emotions", + + }; + + // Limit to requested number of sequences + if (num_seqs < static_cast(sentences.size())) { + sentences.resize(num_seqs); + } + int32_t actual_num_seqs = static_cast(sentences.size()); + + std::cout << "\n>>> [Input] Encoding " << actual_num_seqs << " sequences:" << std::endl; + + std::vector> all_tokens(actual_num_seqs); + int32_t max_prompt_len = 0; + for (int32_t i = 0; i < actual_num_seqs; ++i) { + all_tokens[i] = model.encode(sentences[i]); + if (all_tokens[i].empty()) { + std::cerr << "Error: Failed to encode sentence " << i << std::endl; + return -1; + } + max_prompt_len = std::max(max_prompt_len, static_cast(all_tokens[i].size())); + std::cout << " Seq " << i << ": " << all_tokens[i].size() << " tokens - \"" + << sentences[i].substr(0, 40) << "...\"" << std::endl; + } + + // ========================================== + // 4. Initialize Paged KV Cache + // ========================================== + const int block_size = 16; + + size_t free_mem, total_mem; + get_gpu_memory_info(&free_mem, &total_mem); + + // Calculate available blocks + size_t one_block_bytes = (size_t)block_size * num_layers * 2 * kv_dim * sizeof(float); + size_t reserved_memory = 512UL * 1024 * 1024; // 512MB for activations + size_t kv_cache_pool_size = (size_t)(free_mem * 0.9) - reserved_memory; + int total_blocks = kv_cache_pool_size / one_block_bytes; + int max_blocks_per_seq = (seq_len + block_size - 1) / block_size; + + std::cout << "\n>>> [Memory Plan]" << std::endl; + std::cout << " KV Cache Pool : " << kv_cache_pool_size / 1024 / 1024 << " MB" << std::endl; + std::cout << " Total Blocks : " << total_blocks << std::endl; + std::cout << " Max Context : " << total_blocks * block_size << " Tokens" << std::endl; + + auto kv_init_status = model.init_paged_kv_cache(actual_num_seqs, block_size, total_blocks, max_blocks_per_seq); + if (!kv_init_status) { + std::cerr << "Error: Failed to initialize paged KV cache" << std::endl; + return -1; + } + + std::cout << ">>> [Init] Paged KV Cache initialized for " << actual_num_seqs << " sequences" << std::endl; + + // ========================================== + // 5. Chunked Prefill for all sequences + // ========================================== + std::cout << "\n>>> [Inference] Starting chunked prefill for all sequences..." 
<< std::endl; + + auto* kv_manager = model.get_kv_cache_manager(); + std::vector next_tokens(actual_num_seqs, -1); + std::vector prompt_lens(actual_num_seqs); + + auto prefill_start = std::chrono::steady_clock::now(); + int32_t total_prefill_tokens = 0; + + for (int32_t seq_id = 0; seq_id < actual_num_seqs; ++seq_id) { + const auto& tokens = all_tokens[seq_id]; + prompt_lens[seq_id] = static_cast(tokens.size()); + total_prefill_tokens += prompt_lens[seq_id]; + + // Get embeddings + const auto& prompt_embedding = model.embedding(tokens); + auto [input_tokens, input_embeddings, input_token_num] = prompt_embedding; + + // Compute chunk size if auto + int32_t seq_chunk_size = chunk_size; + if (seq_chunk_size <= 0) { + seq_chunk_size = model.compute_chunk_size(prompt_lens[seq_id], actual_num_seqs); + } + + // Allocate blocks for the entire prompt + if (!kv_manager->allocate_blocks_for_tokens(seq_id, prompt_lens[seq_id])) { + std::cerr << "Error: Failed to allocate KV cache blocks for seq " << seq_id << std::endl; + return -1; + } + + // Chunked prefill + auto prefill_status = model.forward_chunked_prefill( + seq_id, + input_embeddings, + 0, // start_pos + seq_chunk_size, + next_tokens[seq_id]); + + if (!prefill_status) { + std::cerr << "Error: Chunked prefill failed for seq " << seq_id << ": " + << prefill_status.get_err_msg() << std::endl; + return -1; + } + } + + auto prefill_end = std::chrono::steady_clock::now(); + double prefill_ms = std::chrono::duration_cast>( + prefill_end - prefill_start).count(); + + std::cout << " [Prefill] Processed " << total_prefill_tokens << " tokens across " + << actual_num_seqs << " sequences in " << prefill_ms << " ms" << std::endl; + std::cout << " [Prefill] Throughput: " << (total_prefill_tokens * 1000.0 / prefill_ms) + << " tokens/sec" << std::endl; + + // ========================================== + // 6. Batch Decode phase + // ========================================== + std::cout << "\n>>> [Decode] Generating tokens (batch decode)..." 
<< std::endl; + + std::vector> generated_tokens(actual_num_seqs); + std::vector positions(actual_num_seqs); + std::vector finished(actual_num_seqs, false); + + for (int32_t i = 0; i < actual_num_seqs; ++i) { + positions[i] = prompt_lens[i]; + } + + int max_new_tokens = 50; + int total_generated = 0; + + auto decode_start = std::chrono::steady_clock::now(); + + for (int step = 0; step < max_new_tokens; ++step) { + // Collect active sequences + std::vector active_seq_ids; + std::vector active_positions; + std::vector active_tokens; + + for (int32_t i = 0; i < actual_num_seqs; ++i) { + if (!finished[i]) { + // Check for end of sentence + if (model.is_sentence_ending(next_tokens[i])) { + finished[i] = true; + continue; + } + + generated_tokens[i].push_back(next_tokens[i]); + active_seq_ids.push_back(i); + active_positions.push_back(positions[i]); + active_tokens.push_back(next_tokens[i]); + } + } + + if (active_seq_ids.empty()) { + std::cout << " [Decode] All sequences finished" << std::endl; + break; + } + + // Get embeddings for all active tokens + const auto& token_embedding = model.embedding(active_tokens); + auto [tok_ids, tok_emb, tok_num] = token_embedding; + + // Allocate blocks for new tokens + for (size_t i = 0; i < active_seq_ids.size(); ++i) { + int32_t seq_id = active_seq_ids[i]; + kv_manager->allocate_blocks_for_tokens(seq_id, positions[seq_id] + 1); + } + + // Batch decode + std::vector batch_next_tokens; + auto decode_status = model.forward_decode(active_seq_ids, active_positions, tok_emb, batch_next_tokens); + if (!decode_status) { + std::cerr << "Error: Batch decode failed: " << decode_status.get_err_msg() << std::endl; + break; + } + + // Update next tokens and positions + for (size_t i = 0; i < active_seq_ids.size(); ++i) { + int32_t seq_id = active_seq_ids[i]; + if (i < batch_next_tokens.size()) { + next_tokens[seq_id] = batch_next_tokens[i]; + } + kv_manager->set_context_len(seq_id, positions[seq_id] + 1); + positions[seq_id] += 1; + total_generated++; + } + } + + auto decode_end = std::chrono::steady_clock::now(); + double decode_ms = std::chrono::duration_cast>( + decode_end - decode_start).count(); + + // ========================================== + // 7. Print generated text for each sequence + // ========================================== + std::cout << "\n>>> [Generated Text]" << std::endl; + for (int32_t i = 0; i < actual_num_seqs; ++i) { + std::string generated_text = model.decode(generated_tokens[i]); + std::cout << " Seq " << i << " (" << generated_tokens[i].size() << " tokens): " + << sentences[i].substr(0, 30) << "... " << generated_text << std::endl; + } + + // ========================================== + // 8. Statistics + // ========================================== + std::cout << "\n>>> [Statistics]" << std::endl; + std::cout << " Num sequences : " << actual_num_seqs << std::endl; + std::cout << " Total prompt : " << total_prefill_tokens << " tokens" << std::endl; + std::cout << " Total generated : " << total_generated << " tokens" << std::endl; + std::cout << " Decode time : " << decode_ms << " ms" << std::endl; + if (total_generated > 0) { + std::cout << " Decode throughput: " << (total_generated * 1000.0 / decode_ms) + << " tokens/sec" << std::endl; + } + + std::cout << "\n>>> [Success] Multi-sequence unified forward inference completed!" 
<< std::endl; + + return 0; +} \ No newline at end of file diff --git a/demo/mainpaged.cpp b/demo/mainpaged.cpp new file mode 100644 index 0000000..7703979 --- /dev/null +++ b/demo/mainpaged.cpp @@ -0,0 +1,269 @@ +#include +#include +#include +#include +#include +#include +#include "base/tick.h" +#include "base/block_allocator.h" +#include "base/kv_cache_manager.h" +#include "model/llama3.h" +#include "model/config.h" + +void get_gpu_memory_info(size_t* free, size_t* total) { + cudaError_t status = cudaMemGetInfo(free, total); + if (status != cudaSuccess) { + std::cerr << "Error: cudaMemGetInfo failed with " << cudaGetErrorString(status) << std::endl; + return; + } + std::cout << ">>> [GPU] Free Memory: " << *free / (1024 * 1024) << " MB" << std::endl; + std::cout << ">>> [GPU] Total Memory: " << *total / (1024 * 1024) << " MB" << std::endl; +} + +int main(int argc, char* argv[]) { + if (argc != 3) { + std::cout << "Usage: ./llama_infer_paged " << std::endl; + return -1; + } + const char* checkpoint_path = argv[1]; + const char* tokenizer_path = argv[2]; + + // ========================================== + // 1. 初始化模型 + // ========================================== + std::cout << ">>> [Init] Loading model from " << checkpoint_path << std::endl; + + // 创建模型实例 + model::LLama2Model model(base::TokenizerType::kEncodeSpe, tokenizer_path, + checkpoint_path, false); + + auto init_status = model.init(base::DeviceType::kDeviceCUDA); + if (!init_status) { + std::cerr << "Model init failed: " << init_status.get_err_code() << std::endl; + return -1; + } + + std::cout << ">>> [Init] Model loaded successfully" << std::endl; + + // ========================================== + // 2. 从模型配置获取参数 + // ========================================== + const auto* config = model.config(); + if (!config) { + std::cerr << "Error: Failed to get model config" << std::endl; + return -1; + } + + const int32_t head_dim = config->head_size_; + const int32_t num_kv_heads = config->kv_head_num_; + const int32_t num_heads = config->head_num_; + const int32_t num_layers = config->layer_num_; + const int32_t dim = config->dim_; + const int32_t hidden_dim = config->hidden_dim_; + const int32_t seq_len = config->seq_len_; + + std::cout << "\n>>> [Model Config]" << std::endl; + std::cout << " dim : " << dim << std::endl; + std::cout << " hidden_dim : " << hidden_dim << std::endl; + std::cout << " num_layers : " << num_layers << std::endl; + std::cout << " num_heads : " << num_heads << std::endl; + std::cout << " num_kv_heads : " << num_kv_heads << std::endl; + std::cout << " head_dim : " << head_dim << std::endl; + std::cout << " seq_len : " << seq_len << std::endl; + + // ========================================== + // 3. 准备输入句子并编码 + // ========================================== + const std::string sentence = "Once upon a time, in a land far away,"; + std::cout << "\n>>> [Input] Encoding sentence: \"" << sentence << "\"" << std::endl; + + auto tokens = model.encode(sentence); + if (tokens.empty()) { + std::cerr << "Error: Failed to encode sentence" << std::endl; + return -1; + } + + std::cout << " [Encode] Token count: " << tokens.size() << std::endl; + + // ========================================== + // 4. GPU 内存规划和 Paged KV Cache 初始化 + // ========================================== + std::cout << "\n>>> [Memory Planning] Calculating KV cache allocation..." 
<< std::endl; + + const int block_size = 16; // 每个物理块包含的 token 数量 + + // 4.1 预留显存 (Activation + Overhead) + const int max_seq_len = seq_len; // 使用模型配置的 seq_len + + // Activation memory estimation (Prefill 阶段峰值) + const int32_t kv_dim = num_kv_heads * head_dim; + size_t activation_peak = 0; + + // 1. Hidden states + activation_peak += (size_t)max_seq_len * dim * sizeof(float); + + // 2. Q/K/V 投影中间张量 + activation_peak += (size_t)max_seq_len * dim * sizeof(float); // Q + activation_peak += (size_t)max_seq_len * kv_dim * sizeof(float); // K + activation_peak += (size_t)max_seq_len * kv_dim * sizeof(float); // V + + // 3. Attention scores (O(n²) - Prefill 阶段最大的内存消耗) + activation_peak += (size_t)num_heads * max_seq_len * max_seq_len * sizeof(float); + + // 4. FFN 中间层 (SwiGLU 有 gate 和 up 两个分支) + activation_peak += (size_t)2 * max_seq_len * hidden_dim * sizeof(float); + + // 5. 固定开销 (cuBLAS workspace, 临时 buffer 等) + const size_t fixed_overhead = 256UL * 1024 * 1024; // 256MB + const size_t reserved_memory = activation_peak + fixed_overhead; + + std::cout << " [Profile] Estimated Activation Reserve: " << reserved_memory / 1024 / 1024 << " MB" << std::endl; + + // 4.2 计算实际可用显存 + float gpu_memory_utilization = 0.90; + + size_t free_mem, total_mem; + get_gpu_memory_info(&free_mem, &total_mem); + + size_t available_mem = (size_t)(free_mem * gpu_memory_utilization); + + if (available_mem <= reserved_memory) { + std::cerr << "Error: Not enough memory! Available: " << available_mem / 1024 / 1024 + << "MB, Required: " << reserved_memory / 1024 / 1024 << "MB" << std::endl; + return -1; + } + + size_t kv_cache_pool_size = available_mem - reserved_memory; + + // 计算单个 Block 的物理大小 + // Layout: [max_blocks, num_layers, 2 (K+V), block_size, kv_dim] + size_t one_block_bytes = (size_t)block_size * num_layers * 2 * kv_dim * sizeof(float); + + // 计算 Block 数量 + int total_blocks = kv_cache_pool_size / one_block_bytes; + + std::cout << "\n>>> [Memory Plan]" << "\n" + << " Total GPU Mem : " << total_mem / 1024 / 1024 << " MB\n" + << " Free GPU Mem : " << free_mem / 1024 / 1024 << " MB\n" + << " Reserved Mem : " << reserved_memory / 1024 / 1024 << " MB\n" + << " KV Cache Pool : " << kv_cache_pool_size / 1024 / 1024 << " MB\n" + << " Block Size : " << one_block_bytes / 1024 << " KB\n" + << " >>> Total Blocks: " << total_blocks << "\n" + << " >>> Max Context : " << total_blocks * block_size << " Tokens\n" << std::endl; + + if (total_blocks <= 0) { + std::cerr << "Error: No memory left for KV Cache!" << std::endl; + return -1; + } + + // ========================================== + // 5. 初始化 Paged KV Cache + // ========================================== + const int num_seqs = 1; // 当前只支持单个序列 + int max_blocks_per_seq = (max_seq_len + block_size - 1) / block_size; + + std::cout << ">>> [Init] Initializing Paged KV Cache..." << std::endl; + std::cout << " num_seqs : " << num_seqs << std::endl; + std::cout << " block_size : " << block_size << std::endl; + std::cout << " total_blocks : " << total_blocks << std::endl; + std::cout << " max_blocks_per_seq: " << max_blocks_per_seq << std::endl; + + auto kv_init_status = model.init_paged_kv_cache(num_seqs, block_size, total_blocks, max_blocks_per_seq); + if (!kv_init_status) { + std::cerr << "Error: Failed to initialize paged KV cache: " << kv_init_status.get_err_code() << std::endl; + return -1; + } + + std::cout << ">>> [Init] Paged KV Cache initialized successfully" << std::endl; + + // ========================================== + // 6. 
开始推理:Prefill 阶段 + // ========================================== + std::cout << "\n>>> [Inference] Starting generation..." << std::endl; + + int32_t prompt_len = static_cast(tokens.size()); + int32_t pos = 0; + int32_t next = -1; + bool is_prompt = true; + const int total_steps = 128; // 最大生成步数 + + // 获取 prompt embedding + const auto& prompt_embedding = model.embedding(tokens); + tensor::Tensor pos_tensor = model.get_buffer(model::ModelBufferType::kInputPos); + + std::vector generated_tokens; + + // Prefill 阶段:处理 prompt tokens + while (pos < prompt_len - 1) { + pos_tensor.index(0) = pos; + + tensor::Tensor input = model.fill_input(pos_tensor, prompt_embedding, is_prompt); + model.predict(input, pos_tensor, is_prompt, next); + + next = tokens.at(pos + 1); + generated_tokens.push_back(next); + pos += 1; + } + + std::cout << " [Prefill] Processed " << pos << " prompt tokens" << std::endl; + + // ========================================== + // 7. Decode 阶段:逐个生成新 token + // ========================================== + is_prompt = false; + int max_new_tokens = 50; // 最多生成 50 个新 token + + std::cout << "\n>>> [Decode] Generating tokens..." << std::endl; + std::cout << "Generated: "; + fflush(stdout); + + // 用于增量解码:记录上次解码的字符串长度 + std::string prev_decoded = ""; + + for (int step = 0; step < max_new_tokens && pos < total_steps; ++step) { + // 更新位置 + pos_tensor.index(0) = pos; + + // 使用上一步生成的 token 作为输入 + std::vector current_token = {next}; + const auto& token_embedding = model.embedding(current_token); + tensor::Tensor input = model.fill_input(pos_tensor, token_embedding, is_prompt); + + // 执行推理(KVCacheManager 会自动管理块分配) + model.predict(input, pos_tensor, is_prompt, next); + + // 检查是否结束 + if (model.is_sentence_ending(next)) { + std::cout << std::endl; + std::cout << " [Decode] Reached end of sentence token" << std::endl; + break; + } + + // 添加生成的 token + generated_tokens.push_back(next); + + // 增量解码:解码所有已生成的 token,只输出新增部分 + std::string full_decoded = model.decode(generated_tokens); + if (full_decoded.length() > prev_decoded.length()) { + std::cout << full_decoded.substr(prev_decoded.length()); + fflush(stdout); + } + prev_decoded = full_decoded; + + pos += 1; + } + + std::cout << std::endl; + + // ========================================== + // 8. 输出统计信息 + // ========================================== + std::cout << "\n>>> [Statistics]" << std::endl; + std::cout << " Prompt tokens : " << prompt_len << std::endl; + std::cout << " Generated tokens : " << generated_tokens.size() << std::endl; + std::cout << " Total tokens : " << pos + 1 << std::endl; + + std::cout << "\n>>> [Success] Paged attention inference completed!" 
<< std::endl; + + return 0; +} \ No newline at end of file diff --git a/kuiper/include/base/base.h b/kuiper/include/base/base.h index 53ab5d8..a112f48 100644 --- a/kuiper/include/base/base.h +++ b/kuiper/include/base/base.h @@ -26,9 +26,36 @@ enum class ModelBufferType { kFFNRMSNorm = 13, kForwardOutput = 15, kForwardOutputCPU = 16, - kSinCache = 17, kCosCache = 18, + kPagedKVCache = 19, + kPagedBlockTables = 20, + kPagedAttentionContextLens = 21, + // Batch buffers for multi-sequence processing + kBatchQuery = 22, // [num_seqs, dim] + kBatchKey = 23, // [num_seqs, kv_dim] + kBatchValue = 24, // [num_seqs, kv_dim] + kBatchOutputMHA = 25, // [num_seqs, dim] + kBatchAttnOutput = 26, // [num_seqs, dim] + kBatchRMSNormOutput = 27, // [num_seqs, dim] + kBatchW1Output = 28, // [num_seqs, hidden_dim] + kBatchW3Output = 29, // [num_seqs, hidden_dim] + kBatchW2Output = 30, // [num_seqs, dim] + kBatchFFNRMSNorm = 31, // [num_seqs, dim] + kBatchInputEmbeddings = 32, // [num_seqs, dim] + kBatchPositions = 33, // [num_seqs] - positions for each sequence + kBatchForwardOutput = 34, // [num_seqs, vocab_size] - logits for each sequence + // Unified forward buffers (for chunked prefill / continuous batching) + kUnifiedQuery = 35, // [num_tokens, dim] + kUnifiedKey = 36, // [num_tokens, kv_dim] + kUnifiedValue = 37, // [num_tokens, kv_dim] + kUnifiedRMSNormOutput = 38, // [num_tokens, dim] + kUnifiedMHAOutput = 39, // [num_tokens, dim] + kUnifiedAttnOutput = 40, // [num_tokens, dim] + kUnifiedFFNRMSNorm = 41, // [num_tokens, dim] + kUnifiedW1Output = 42, // [num_tokens, hidden_dim] + kUnifiedW3Output = 43, // [num_tokens, hidden_dim] + kUnifiedW2Output = 44, // [num_tokens, dim] }; } diff --git a/kuiper/include/base/batch_metadata.h b/kuiper/include/base/batch_metadata.h new file mode 100644 index 0000000..c93c08c --- /dev/null +++ b/kuiper/include/base/batch_metadata.h @@ -0,0 +1,75 @@ +#ifndef KUIPER_INCLUDE_BASE_BATCH_METADATA_H_ +#define KUIPER_INCLUDE_BASE_BATCH_METADATA_H_ + +#include +#include +#include "base/alloc.h" +#include "tensor/tensor.h" + +namespace base { + +// Unified batch metadata for forward pass +// Supports: single decode, batch decode, chunked prefill, mixed prefill+decode +struct BatchMetadata { + int32_t num_tokens = 0; // Total number of tokens in this batch + int32_t num_seqs = 0; // Number of sequences + + // GPU tensors + tensor::Tensor seq_ids; // [num_tokens] - sequence index for each token + tensor::Tensor positions; // [num_tokens] - position for each token + tensor::Tensor seq_start_locs; // [num_seqs + 1] - start location of each sequence in tokens + + // CPU copies for host-side logic + std::vector seq_ids_cpu; + std::vector positions_cpu; + std::vector seq_start_locs_cpu; + std::vector seq_lens_cpu; // Current context length for each sequence + std::vector is_prefill_cpu; // Whether each sequence is in prefill phase + + // Query methods + bool has_prefill() const; + bool has_decode() const; + int32_t max_prefill_len() const; + int32_t max_context_len() const; + int32_t get_num_prefill_tokens() const; + int32_t get_num_decode_tokens() const; + + // Get the last token index for each sequence (for logits computation) + std::vector get_last_token_indices() const; + + // Factory methods + static BatchMetadata create_decode( + const std::vector& seq_ids, + const std::vector& positions, + const std::vector& context_lens, + std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream = nullptr); + + static BatchMetadata create_prefill( + int32_t seq_id, + int32_t 
start_pos, + int32_t num_tokens, + int32_t context_len, + std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream = nullptr); + + static BatchMetadata create_mixed( + const std::vector& seq_ids, + const std::vector& token_counts, // Number of tokens per sequence + const std::vector& start_positions, + const std::vector& context_lens, + const std::vector& is_prefill, + std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream = nullptr); + + private: + void sync_to_device(std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream); +}; + +} // namespace base +#endif // KUIPER_INCLUDE_BASE_BATCH_METADATA_H_ \ No newline at end of file diff --git a/kuiper/include/base/block_allocator.h b/kuiper/include/base/block_allocator.h new file mode 100644 index 0000000..66b1331 --- /dev/null +++ b/kuiper/include/base/block_allocator.h @@ -0,0 +1,151 @@ +/** + * @author jintang-coder + * - BlockAllocator: Physical memory block pool management + * + * ============================================================================= + * BlockAllocator 物理块分配器 + * ============================================================================= + * + * 功能: 管理GPU显存中的物理块池,类似操作系统的页帧分配器 + * + * 核心数据结构: + * class BlockAllocator { + * int total_blocks_; // 总块数 + * int block_size_; // 每块可存储的token数 + * std::vector free_blocks_; // 空闲块ID栈 + * }; + * + * --- + * 具体例子: 初始化与分配过程 + * + * 假设配置: + * total_blocks = 100 + * block_size = 16 (每块存16个token的KV) + * + * 初始化后: + * free_blocks_ = [99, 98, 97, ..., 2, 1, 0] // 栈结构,后进先出 + * + * --- + * 分配场景1: Prefill阶段 - 一次分配多块 + * + * // 序列有50个token,需要 ceil(50/16) = 4 个blocks + * std::vector blocks = allocator.allocate(4); + * // 返回 blocks = [0, 1, 2, 3] + * // free_blocks_ = [99, 98, ..., 5, 4] + * + * --- + * 分配场景2: Decode阶段 - 单块分配 + * + * // 当前token位置超出已分配块,需要新块 + * int new_block = allocator.allocate(); + * // 返回 new_block = 4 + * // free_blocks_ = [99, 98, ..., 5] + * + * --- + * 释放场景: 序列完成后归还 + * + * allocator.free({0, 1, 2, 3, 4}); + * // free_blocks_ = [99, 98, ..., 5, 0, 1, 2, 3, 4] + * // 注意: 归还顺序不影响,下次分配从栈顶取 + * + * --- + * 与上层组件的关系: + * + * KVCacheManager + * │ + * │ allocate_blocks_for_tokens(seq_idx, num_tokens) + * │ └─> 计算需要多少blocks + * │ └─> 调用 block_alloc_->allocate() + * ▼ + * BlockAllocator + * │ + * │ allocate() / free() + * ▼ + * free_blocks_ 栈 + * + * --- + * 内存布局示意: + * + * GPU显存 KV Cache Pool: + * ┌─────────┬─────────┬─────────┬─────────┬─────────┬─────────┐ + * │ Block 0 │ Block 1 │ Block 2 │ Block 3 │ Block 4 │ ... 
│ + * │ (seq0) │ (seq0) │ (seq1) │ (seq1) │ (free) │ │ + * └─────────┴─────────┴─────────┴─────────┴─────────┴─────────┘ + * + * 每个Block内部: + * [num_layers][2(K/V)][block_size][kv_dim] + * + */ + +#ifndef KUIPER_BASE_BLOCK_ALLOCATOR_H +#define KUIPER_BASE_BLOCK_ALLOCATOR_H + +#include +#include +#include + + +namespace base { + +// 显存物理逻辑块管理器 +class BlockAllocator { +public: + // 初始化:构建空闲块链表(栈) + explicit BlockAllocator(int total_blocks, int block_size) + : total_blocks_(total_blocks), block_size_(block_size) { + free_blocks_.reserve(total_blocks); + for (int i = 0; i < total_blocks; ++i) { + // 初始时所有块都是空闲的 + free_blocks_.push_back(total_blocks - 1 - i); + } + } + + // 分配 n 个块,成功返回 block_ids,失败返回空 + std::vector allocate(int num_blocks) { + std::lock_guard lock(mutex_); + if (free_blocks_.size() < num_blocks) { + return {}; // OOM (Out of Memory) + } + std::vector allocated_blocks; + for (int i = 0; i < num_blocks; ++i) { + allocated_blocks.push_back(free_blocks_.back()); + free_blocks_.pop_back(); + } + return allocated_blocks; + } + + // 分配单个块(用于 Decode 阶段) + int allocate() { + std::lock_guard lock(mutex_); + if (free_blocks_.empty()) { + return -1; // OOM + } + int block_id = free_blocks_.back(); + free_blocks_.pop_back(); + return block_id; + } + + // 释放块,归还到池中 + void free(const std::vector& blocks) { + std::lock_guard lock(mutex_); + for (int block_id : blocks) { + free_blocks_.push_back(block_id); + } + } + + int get_free_block_num() const { + return free_blocks_.size(); + } + + int block_size() const { return block_size_; } + +private: + int total_blocks_; + int block_size_; + std::vector free_blocks_; // 空闲块 ID 栈 + std::mutex mutex_; +}; + +} // namespace base + +#endif // KUIPER_BASE_BLOCK_ALLOCATOR_H \ No newline at end of file diff --git a/kuiper/include/base/kv_cache_manager.h b/kuiper/include/base/kv_cache_manager.h new file mode 100644 index 0000000..353a573 --- /dev/null +++ b/kuiper/include/base/kv_cache_manager.h @@ -0,0 +1,114 @@ +#ifndef KUIPER_BASE_KV_CACHE_MANAGER_H +#define KUIPER_BASE_KV_CACHE_MANAGER_H + +#include +#include +#include +#include "base/alloc.h" +#include "base/block_allocator.h" +#include "base/buffer.h" +#include "tensor/tensor.h" + +namespace base { + +class KVCacheManager { +public: + base::Status init_paged_token_major( + base::DeviceType device_type, + std::shared_ptr alloc_dev, + std::shared_ptr alloc_cpu, + int32_t num_seqs, + int32_t num_layers, + int32_t num_kv_heads, + int32_t head_dim, + int32_t block_size, + int32_t max_blocks, + int32_t max_blocks_per_seq, + void* stream = nullptr); + + std::pair slice_kv( + int32_t seq_idx, int32_t layer_idx, int32_t token_pos); + + const tensor::Tensor& kv_cache_device() const; + const tensor::Tensor& block_tables_device(); // 确保已同步到 device + const tensor::Tensor& context_lens_device(); // 确保已同步到 device + + const tensor::Tensor& block_tables_host() const; + const tensor::Tensor& context_lens_host() const; + + void sync_tables_to_device(void* stream = nullptr); // host->device + + void reset_sequence(int32_t seq_idx); // 释放该 seq 的所有 blocks,block_tables=-1, context_len=0 + void reset_all(); + + // Getters for configuration + int32_t block_size() const; + int32_t max_blocks_per_seq() const; + int32_t kv_dim() const; + int32_t num_layers() const; + int32_t num_kv_heads() const; + int32_t head_dim() const; + int32_t num_seqs() const; + int32_t max_blocks() const; + + // Get context length for a sequence + int32_t get_context_len(int32_t seq_idx) const; + + // Set context length directly (useful after prefill) 
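// A minimal usage sketch (mirrors demo/main_unified.cpp; `kv_mgr`, `slot`,
// `prompt_len` and `step` are placeholder names, not part of this header):
//   kv_mgr->allocate_blocks_for_tokens(slot, prompt_len);              // prefill: reserve blocks for the whole prompt
//   /* ... run prefill forward ... */
//   kv_mgr->set_context_len(slot, prompt_len);                         // record the prefilled length
//   kv_mgr->allocate_blocks_for_tokens(slot, prompt_len + step + 1);   // decode: grow by one token per step
//   kv_mgr->set_context_len(slot, prompt_len + step + 1);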
+ void set_context_len(int32_t seq_idx, int32_t context_len); + + // Allocate blocks for multiple tokens at once (for prefill) + // Returns true if allocation succeeded + bool allocate_blocks_for_tokens(int32_t seq_idx, int32_t num_tokens); + + // Sequence slot management (for Scheduler) + // Allocate a free sequence slot, returns slot index or -1 if none available + int32_t allocate_sequence_slot(); + + // Release a sequence slot (frees blocks and resets state) + void release_sequence_slot(int32_t seq_idx); + + // Check if there are free sequence slots + bool has_free_slot() const; + + // Get number of free sequence slots + int32_t num_free_slots() const; + + // Get number of free blocks (for memory pressure check) + int32_t num_free_blocks() const; + +private: + float* key_ptr_(int32_t physical_block_id, int32_t layer_idx, int32_t token_in_block); + float* value_ptr_(int32_t physical_block_id, int32_t layer_idx, int32_t token_in_block); + + int32_t ensure_physical_block_(int32_t seq_idx, int32_t logical_block); + + // Internal sync without locking (caller must hold mu_) + void sync_tables_to_device_unlocked_(void* stream); + +private: + mutable std::mutex mu_; + base::DeviceType device_type_{base::DeviceType::kDeviceUnknown}; + + int32_t num_seqs_{0}, num_layers_{0}, num_kv_heads_{0}, head_dim_{0}; + int32_t kv_dim_{0}; + int32_t block_size_{0}, max_blocks_{0}, max_blocks_per_seq_{0}; + + std::shared_ptr alloc_dev_; + std::shared_ptr alloc_cpu_; + + tensor::Tensor kv_cache_dev_; // float, device + tensor::Tensor block_tables_host_; // int32, cpu + tensor::Tensor context_lens_host_; // int32, cpu + tensor::Tensor block_tables_dev_; // int32, device mirror + tensor::Tensor context_lens_dev_; // int32, device mirror + + std::unique_ptr block_alloc_; + bool tables_dirty_{true}; + + // Sequence slot management + std::vector free_seq_slots_; // Available sequence slots +}; + +} // namespace base +#endif // KUIPER_BASE_KV_CACHE_MANAGER_H diff --git a/kuiper/include/base/scheduler.h b/kuiper/include/base/scheduler.h new file mode 100644 index 0000000..e3df799 --- /dev/null +++ b/kuiper/include/base/scheduler.h @@ -0,0 +1,124 @@ +#ifndef KUIPER_BASE_SCHEDULER_H +#define KUIPER_BASE_SCHEDULER_H + +#include +#include +#include +#include +#include +#include "base/scheduler_config.h" +#include "base/sequence.h" + +namespace base { + +class KVCacheManager; + +class Scheduler { +public: + explicit Scheduler(const SchedulerConfig& config); + ~Scheduler() = default; + + // Set the KV cache manager (required before scheduling) + void set_kv_cache_manager(KVCacheManager* kv_manager); + + // Add a new request to the waiting queue + // Returns the assigned sequence ID + int32_t add_request(const std::vector& prompt_tokens, + int32_t max_new_tokens = 128, + const std::string& original_text = ""); + + // Main scheduling function - called each iteration + // Returns which sequences to process and how many tokens each + SchedulerOutput schedule(); + + // Update sequence state after forward pass + // next_tokens: one token per scheduled sequence (in order) + void update_after_forward(const SchedulerOutput& output, + const std::vector& next_tokens); + + // Mark a sequence as finished (EOS or external termination) + void finish_sequence(int32_t seq_id); + + // Get sequence by ID (for reading output tokens, etc.) 
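// Minimal driver sketch (condensed from demo/main_continuous.cpp; the model
// call is elided and `next_tokens` is assumed to hold one token per scheduled
// sequence, in schedule order):
//   base::Scheduler scheduler(config);
//   scheduler.set_kv_cache_manager(kv_manager);
//   scheduler.add_request(prompt_tokens, /*max_new_tokens=*/50);
//   while (!scheduler.all_finished()) {
//     auto out = scheduler.schedule();
//     if (!out.has_work()) break;
//     std::vector<int32_t> next_tokens;
//     /* ... build BatchMetadata from out.scheduled_seqs and run the model ... */
//     scheduler.update_after_forward(out, next_tokens);
//   }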
+ const Sequence* get_sequence(int32_t seq_id) const; + + // Get mutable sequence by ID + Sequence* get_sequence_mut(int32_t seq_id); + + // Check if all sequences are finished (no waiting, no running) + bool all_finished() const; + + // Check if there's any work to do + bool has_pending_work() const; + + // Get statistics + int32_t num_waiting() const; + int32_t num_running() const; + int32_t num_preempted() const; + int32_t num_finished() const; + + // Get all finished sequences + std::vector get_finished_sequences() const; + + // Get all sequences (for final output) + std::vector get_all_sequences() const; + + // Get config + const SchedulerConfig& config() const { return config_; } + +private: + // Check if we can admit a new sequence from waiting queue + bool can_admit_new_sequence() const; + + // Estimate blocks needed for a sequence + int32_t estimate_blocks_needed(const Sequence& seq, int32_t additional_tokens) const; + + // Schedule decode sequences (priority) + void schedule_decode(SchedulerOutput& output, int32_t& remaining_tokens, int32_t& remaining_seqs); + + // Schedule prefill sequences (or chunks) + void schedule_prefill(SchedulerOutput& output, int32_t& remaining_tokens, int32_t& remaining_seqs); + + // Try to admit new sequences from waiting queue + void try_admit_waiting(SchedulerOutput& output, int32_t& remaining_tokens, int32_t& remaining_seqs); + + // Try to re-admit preempted sequences (priority over waiting) + void try_admit_preempted(SchedulerOutput& output, int32_t& remaining_tokens, int32_t& remaining_seqs); + + // Move sequence from waiting to running + bool admit_sequence(Sequence& seq); + + // Add a sequence to the scheduled output + void add_to_schedule(SchedulerOutput& output, Sequence& seq, + int32_t num_tokens, bool is_prefill); + + // Preemption: evict sequences when memory is tight + // Returns number of sequences evicted + int32_t preempt_sequences(int32_t blocks_needed); + + // Select victim sequence for preemption (lowest priority = least generated tokens) + int32_t select_victim_sequence() const; + + // Check if memory pressure requires preemption + bool need_preemption(int32_t blocks_needed) const; + +private: + SchedulerConfig config_; + KVCacheManager* kv_manager_ = nullptr; + + // Sequence storage + std::unordered_map sequences_; + int32_t next_seq_id_ = 0; + + // Queues (store seq_ids, not Sequence objects) + std::deque waiting_queue_; // FIFO for new requests + std::deque running_queue_; // Active sequences + std::deque preempted_queue_; // Preempted sequences (priority over waiting) + std::vector finished_seqs_; // Completed sequences + + mutable std::mutex mutex_; +}; + +} // namespace base + +#endif // KUIPER_BASE_SCHEDULER_H \ No newline at end of file diff --git a/kuiper/include/base/scheduler_config.h b/kuiper/include/base/scheduler_config.h new file mode 100644 index 0000000..631c8e3 --- /dev/null +++ b/kuiper/include/base/scheduler_config.h @@ -0,0 +1,36 @@ +#ifndef KUIPER_BASE_SCHEDULER_CONFIG_H +#define KUIPER_BASE_SCHEDULER_CONFIG_H + +#include + +namespace base { + +struct SchedulerConfig { + // Maximum number of tokens to process in a single iteration + // This limits GPU memory usage for activations + int32_t max_num_batched_tokens = 512; + + // Maximum number of sequences in a single batch + int32_t max_num_seqs = 256; + + // Maximum sequence length (context + generated) + int32_t max_model_len = 4096; + + // Maximum tokens for a single prefill chunk + // Smaller chunks allow decode requests to interleave + int32_t 
max_prefill_tokens = 128; + + // Minimum free blocks to keep as buffer + // Prevents OOM during decode phase + int32_t min_free_blocks = 8; + + // Block size for KV cache (must match KVCacheManager) + int32_t block_size = 16; + + // Policy: true mean decode first , false = FCFS + bool decode_priority = true; +}; + +} + +#endif // KUIPER_BASE_SCHEDULER_CONFIG_H diff --git a/kuiper/include/base/sequence.h b/kuiper/include/base/sequence.h new file mode 100644 index 0000000..9acf179 --- /dev/null +++ b/kuiper/include/base/sequence.h @@ -0,0 +1,116 @@ +#ifndef KUIPER_BASE_SEQUENCE_H +#define KUIPER_BASE_SEQUENCE_H + +#include +#include +#include +#include + +namespace base { + +// Sequence lifecycle states +enum class SequenceStatus : uint8_t { + WAITING = 0, // In waiting queue, not started + RUNNING = 1, // Being processed (has KV cache allocated) + PREEMPTED = 2, // Evicted due to memory pressure, can be rescheduled + FINISHED = 3, // Generation complete (EOS or max length) +}; + +// Represents a single inference request/sequence +struct Sequence { + // Unique identifier for this sequence + int32_t seq_id = -1; + + // Slot index in KVCacheManager (may differ from seq_id) + int32_t kv_slot = -1; + + // Current status + SequenceStatus status = SequenceStatus::WAITING; + + // Input prompt tokens + std::vector prompt_tokens; + + // Generated output tokens + std::vector output_tokens; + + // Current position in prefill (0 = not started, prompt_len = prefill done) + int32_t prefill_pos = 0; + + // Next token to feed (for decode phase) + int32_t next_token = -1; + + // Request metadata + int32_t max_new_tokens = 128; + std::string original_text; + + // Preemption tracking + int32_t num_preemptions = 0; // How many times this sequence was preempted + + // Timing (optional, for metrics) + std::chrono::steady_clock::time_point arrival_time; + std::chrono::steady_clock::time_point first_token_time; + + // Total context length = prefill_pos + output_tokens.size() + int32_t context_len() const { + return prefill_pos + static_cast(output_tokens.size()); + } + + // Check if still in prefill phase + bool is_prefill() const { + return prefill_pos < static_cast(prompt_tokens.size()); + } + + // Remaining prefill tokens + int32_t remaining_prefill() const { + return static_cast(prompt_tokens.size()) - prefill_pos; + } + + // Total prompt length + int32_t prompt_len() const { + return static_cast(prompt_tokens.size()); + } + + // Number of generated tokens + int32_t num_generated() const { + return static_cast(output_tokens.size()); + } +}; + +// Information about a sequence scheduled for this iteration +struct ScheduledSequence { + int32_t seq_id; + int32_t kv_slot; + int32_t num_tokens; // Tokens to process this iteration + int32_t start_pos; // Starting position + int32_t context_len; // Context length after processing + bool is_prefill; // true = prefill, false = decode + std::vector tokens; // Actual token IDs to process +}; + +// Output of scheduler.schedule() call +struct SchedulerOutput { + // Sequences scheduled for this iteration + std::vector scheduled_seqs; + + // Sequences preempted this iteration (for logging/metrics) + std::vector preempted_seq_ids; + + // Total tokens in this batch + int32_t num_tokens = 0; + + // Breakdown + int32_t num_prefill_tokens = 0; + int32_t num_decode_tokens = 0; + int32_t num_prefill_seqs = 0; + int32_t num_decode_seqs = 0; + + // True if there's work to do + bool has_work() const { return !scheduled_seqs.empty(); } + + // True if batch contains mixed prefill + decode 
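// ---------------------------------------------------------------------------
// Editor's note (not part of the original patch): a worked trace of the
// Sequence helpers above for one request going through chunked prefill and
// then decode. The numbers follow the 20-token-prompt example used in
// scheduler.cpp; token ids are arbitrary.
//
//   Sequence seq;
//   seq.prompt_tokens.resize(20);        // prompt_len() == 20
//   // prefill_pos = 0 : is_prefill() = true,  remaining_prefill() = 20, context_len() = 0
//
//   seq.prefill_pos += 16;               // first prefill chunk (16 tokens)
//   // is_prefill() = true,  remaining_prefill() = 4,  context_len() = 16
//
//   seq.prefill_pos += 4;                // second chunk finishes prefill
//   // is_prefill() = false, remaining_prefill() = 0,  context_len() = 20
//
//   seq.output_tokens.push_back(42);     // first generated token recorded
//   // num_generated() = 1,  context_len() = 21
// ---------------------------------------------------------------------------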
+ bool is_mixed() const { return num_prefill_seqs > 0 && num_decode_seqs > 0; } +}; + +} // namespace base + +#endif // KUIPER_BASE_SEQUENCE_H diff --git a/kuiper/include/model/llama3.h b/kuiper/include/model/llama3.h index fdf8dd7..468bddd 100644 --- a/kuiper/include/model/llama3.h +++ b/kuiper/include/model/llama3.h @@ -43,8 +43,28 @@ class LLama2Model : public Model { base::Status forward(const tensor::Tensor& input, const tensor::Tensor& pos_tensor, int& next) const override; + // Unified forward interface - supports all scenarios + base::Status forward_unified( + const tensor::Tensor& hidden_states, // [num_tokens, dim] + const base::BatchMetadata& batch_meta, + std::vector& next_tokens) const override; + op::EmbeddingOutput embedding(const std::vector& tokens) const override; + // Get cuda config for external use + std::shared_ptr get_cuda_config() const { return cuda_config_; } + + // Initialize batch buffers for multi-sequence processing + void init_batch_buffers(int32_t num_seqs); + + // Batch forward for multi-sequence decode (legacy, calls forward_unified internally) + // inputs: [num_seqs, dim] - one token per sequence + // positions: [num_seqs] - position for each sequence + // outputs: [num_seqs] - next token for each sequence + base::Status forward_batch(const tensor::Tensor& inputs, + const tensor::Tensor& positions, + std::vector& next_tokens) const; + private: void init_mem() override; @@ -58,6 +78,9 @@ class LLama2Model : public Model { void attention_mha(int32_t layer_idx, const tensor::Tensor& pos_tensor) const; + // Paged attention MHA for decode phase + void attention_mha_paged(int32_t layer_idx, const tensor::Tensor& pos_tensor) const; + void attention_rms(int32_t layer_idx, const tensor::Tensor& input) const; void feed_forward(int32_t layer_idx, const tensor::Tensor& input) const; @@ -68,6 +91,9 @@ class LLama2Model : public Model { int32_t post_processing(const tensor::Tensor& pos, bool is_prompt) const override; + // Helper methods for unified forward + void ensure_unified_buffers(int32_t num_tokens) const; + private: std::shared_ptr cuda_config_; std::unique_ptr llama_layers_; diff --git a/kuiper/include/model/model.h b/kuiper/include/model/model.h index 63b8514..f2dfeb6 100644 --- a/kuiper/include/model/model.h +++ b/kuiper/include/model/model.h @@ -3,6 +3,9 @@ #include #include #include +#include +#include "base/base.h" +#include "base/batch_metadata.h" #include "config.h" #include "op/encode.h" #include "op/layer.h" @@ -10,7 +13,8 @@ #include "sampler/argmax_sampler.h" #include "sentencepiece_processor.h" #include "tensor/tensor.h" - +#include "base/kv_cache_manager.h" +#include "base/block_allocator.h" namespace model { class Model { public: @@ -19,12 +23,57 @@ class Model { virtual base::Status init(base::DeviceType device_type) = 0; + // Legacy single-token interface (deprecated, calls forward_unified internally) virtual base::Status predict(const tensor::Tensor& input, const tensor::Tensor& pos_tensor, bool is_prompt, int& next) const = 0; + // Legacy single-token forward (deprecated, calls forward_unified internally) virtual base::Status forward(const tensor::Tensor& input, const tensor::Tensor& pos_tensor, int& next) const = 0; + // ============================================================================ + // Unified forward interface - supports all scenarios: + // - Single token decode + // - Batch decode (multiple sequences, one token each) + // - Chunked prefill (single sequence, multiple tokens) + // - Mixed prefill + decode (continuous 
batching) + // ============================================================================ + virtual base::Status forward_unified( + const tensor::Tensor& hidden_states, // [num_tokens, dim] + const base::BatchMetadata& batch_meta, + std::vector& next_tokens) const; + + // Convenience methods + base::Status forward_decode( + const std::vector& seq_ids, + const std::vector& positions, + const tensor::Tensor& hidden_states, + std::vector& next_tokens) const; + + base::Status forward_prefill( + int32_t seq_id, + const tensor::Tensor& hidden_states, // [num_tokens, dim] + int32_t start_pos, + int32_t& next_token) const; + + // Chunked prefill: automatically splits long sequences into chunks + base::Status forward_chunked_prefill( + int32_t seq_id, + const tensor::Tensor& hidden_states, // [total_tokens, dim] + int32_t start_pos, + int32_t chunk_size, // 0 = auto-compute + int32_t& next_token) const; + + // Compute optimal chunk size based on available memory + int32_t compute_chunk_size(int32_t total_tokens, int32_t num_active_seqs = 1) const; + + virtual base::Status init_paged_kv_cache(int32_t num_seqs, int32_t block_size, + int32_t max_blocks, + int32_t max_blocks_per_seq = -1); + + // Initialize unified buffers for variable token counts + virtual void init_unified_buffers(int32_t max_tokens); + base::ModelType model_type() const; const std::string& token_path() const; @@ -54,6 +103,12 @@ class Model { const op::EmbeddingOutput& embedding_output, bool is_prompt) const; + // Get model configuration + const TransformerConfig* config() const { return config_.get(); } + + // Get KV cache manager (for paged attention) + base::KVCacheManager* get_kv_cache_manager() const { return kv_cache_manager_.get(); } + protected: virtual base::Status insert_buffer(ModelBufferType buffer_idx, const tensor::Tensor& tensor); @@ -80,6 +135,12 @@ class Model { protected: int32_t group_size_ = 1; + int32_t num_seqs_ = 1; // Number of sequences for batch processing + int32_t max_unified_tokens_ = 0; // Max tokens for unified buffers + mutable std::mutex paged_kv_mutex_; + std::unique_ptr block_allocator_; + std::unique_ptr kv_cache_manager_; + bool is_quant_model_ = false; std::unique_ptr config_; diff --git a/kuiper/include/model/request.h b/kuiper/include/model/request.h new file mode 100644 index 0000000..ec79fba --- /dev/null +++ b/kuiper/include/model/request.h @@ -0,0 +1,46 @@ +#ifndef KUIPER_INCLUDE_MODEL_REQUEST_H_ +#define KUIPER_INCLUDE_MODEL_REQUEST_H_ + +#include +#include + +namespace model { + +// Request structure for managing inference requests with paged attention +struct Request { + int32_t req_id; // Unique request identifier + std::vector tokens; // Token sequence (prompt + generated tokens) + std::vector block_table; // Physical block IDs allocated for this request + int32_t prompt_len; // Length of the original prompt + bool is_finished; // Whether generation is complete + + explicit Request(int32_t id, const std::vector& prompt_tokens) + : req_id(id), + tokens(prompt_tokens), + prompt_len(static_cast(prompt_tokens.size())), + is_finished(false) {} + + // Get current sequence length (prompt + generated tokens) + int32_t get_current_seq_len() const { + return static_cast(tokens.size()); + } + + // Get number of generated tokens (excluding prompt) + int32_t get_generated_len() const { + return get_current_seq_len() - prompt_len; + } + + // Add a newly generated token + void add_token(int32_t token) { + tokens.push_back(token); + } + + // Mark request as finished + void finish() { + is_finished 
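// ---------------------------------------------------------------------------
// Editor's note (not part of the original patch): minimal usage sketch of this
// struct; the token ids are arbitrary illustrations.
//
//   model::Request req(/*id=*/0, std::vector<int32_t>{1, 15043, 3186});  // 3-token prompt
//   // req.get_current_seq_len() == 3, req.get_generated_len() == 0
//
//   req.add_token(278);                  // one generated token
//   // req.get_current_seq_len() == 4, req.get_generated_len() == 1
//
//   req.block_table = {0, 1};            // filled in by the block allocator/manager
//   req.finish();                        // req.is_finished == true
// ---------------------------------------------------------------------------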
= true; + } +}; + +} // namespace model + +#endif // KUIPER_INCLUDE_MODEL_REQUEST_H_ diff --git a/kuiper/include/tensor/tensor.h b/kuiper/include/tensor/tensor.h index 038f13c..6f591ce 100644 --- a/kuiper/include/tensor/tensor.h +++ b/kuiper/include/tensor/tensor.h @@ -14,7 +14,7 @@ class Tensor { explicit Tensor() = default; explicit Tensor(base::DataType data_type, int32_t dim0, bool need_alloc = false, - std::shared_ptr alloc = nullptr, void* ptr = nullptr); + std::shared_ptr alloc = nullptr, void* ptr = nullptr); explicit Tensor(base::DataType data_type, int32_t dim0, int32_t dim1, bool need_alloc = false, std::shared_ptr alloc = nullptr, void* ptr = nullptr); @@ -30,6 +30,10 @@ class Tensor { explicit Tensor(base::DataType data_type, std::vector dims, bool need_alloc = false, std::shared_ptr alloc = nullptr, void* ptr = nullptr); + // Constructor for large tensors (size > INT32_MAX) + explicit Tensor(base::DataType data_type, size_t size, bool need_alloc = false, + std::shared_ptr alloc = nullptr, void* ptr = nullptr); + void to_cpu(); void to_cuda(cudaStream_t stream = nullptr); diff --git a/kuiper/source/base/batch_metadata.cpp b/kuiper/source/base/batch_metadata.cpp new file mode 100644 index 0000000..bc44c07 --- /dev/null +++ b/kuiper/source/base/batch_metadata.cpp @@ -0,0 +1,264 @@ +/** + + * @author jintang-coder + * - BatchMetadata: Token-to-sequence mapping for GPU + * + */ +// BatchMetadata 中 tokens 的拆解与映射 + +// 核心数据结构 + +// struct BatchMetadata { +// int32_t num_tokens; // 总token数 +// int32_t num_seqs; // 序列数 + +// // 关键映射数组 (长度 = num_tokens) +// tensor::Tensor seq_ids; // [num_tokens] 每个token属于哪个序列 +// tensor::Tensor positions; // [num_tokens] 每个token在序列中的位置 + +// // 序列边界 (长度 = num_seqs + 1) +// tensor::Tensor seq_start_locs; // [num_seqs+1] 每个序列在token数组中的起始位置 +// }; + +// --- +// 具体例子:Mixed Batch (Prefill + Decode 混合) + +// 假设调度器输出: +// Seq0 (kv_slot=0): prefill, 10 tokens, start_pos=0 +// Seq1 (kv_slot=1): decode, 1 token, start_pos=25 +// Seq2 (kv_slot=2): prefill, 5 tokens, start_pos=0 + +// 输入参数 (来自 Scheduler) + +// seq_ids_in = [0, 1, 2] // kv_slot +// token_counts = [10, 1, 5] // 每个序列的token数 +// start_positions = [0, 25, 0] // 每个序列的起始位置 +// context_lens = [10, 26, 5] // 处理后的context长度 +// is_prefill = [true, false, true] + +// create_mixed() 构建过程 + +// // 1. 计算总token数 +// num_tokens = 10 + 1 + 5 = 16 +// num_seqs = 3 + +// // 2. 构建 seq_start_locs (序列边界) +// seq_start_locs = [0, 10, 11, 16] +// // │ │ │ └── 结束位置 +// // │ │ └── seq2 起始 (offset=11) +// // │ └── seq1 起始 (offset=10) +// // └── seq0 起始 (offset=0) + +// // 3. 展开 seq_ids (每个token属于哪个序列) +// seq_ids = [0,0,0,0,0,0,0,0,0,0, 1, 2,2,2,2,2] +// // └──── seq0: 10个 ────┘ │ └─ seq2: 5个 ─┘ +// // seq1: 1个 + +// // 4. 
展开 positions (每个token的位置) +// positions = [0,1,2,3,4,5,6,7,8,9, 25, 0,1,2,3,4] +// // └── seq0: pos 0-9 ──┘ │ └─ seq2: pos 0-4 ─┘ +// // seq1: pos 25 + + + + + + + +#include "base/batch_metadata.h" +#include +#include +#include +#include + +namespace base { + +bool BatchMetadata::has_prefill() const { + for (bool is_pf : is_prefill_cpu) { + if (is_pf) return true; + } + return false; +} + +bool BatchMetadata::has_decode() const { + for (bool is_pf : is_prefill_cpu) { + if (!is_pf) return true; + } + return false; +} + +int32_t BatchMetadata::max_prefill_len() const { + int32_t max_len = 0; + for (int32_t i = 0; i < num_seqs; ++i) { + if (is_prefill_cpu[i]) { + int32_t len = seq_start_locs_cpu[i + 1] - seq_start_locs_cpu[i]; + max_len = std::max(max_len, len); + } + } + return max_len; +} + +int32_t BatchMetadata::max_context_len() const { + if (seq_lens_cpu.empty()) return 0; + return *std::max_element(seq_lens_cpu.begin(), seq_lens_cpu.end()); +} + +int32_t BatchMetadata::get_num_prefill_tokens() const { + int32_t count = 0; + for (int32_t i = 0; i < num_seqs; ++i) { + if (is_prefill_cpu[i]) { + count += seq_start_locs_cpu[i + 1] - seq_start_locs_cpu[i]; + } + } + return count; +} + +int32_t BatchMetadata::get_num_decode_tokens() const { + int32_t count = 0; + for (int32_t i = 0; i < num_seqs; ++i) { + if (!is_prefill_cpu[i]) { + count += seq_start_locs_cpu[i + 1] - seq_start_locs_cpu[i]; + } + } + return count; +} + +std::vector BatchMetadata::get_last_token_indices() const { + std::vector indices(num_seqs); + for (int32_t i = 0; i < num_seqs; ++i) { + indices[i] = seq_start_locs_cpu[i + 1] - 1; + } + return indices; +} + +void BatchMetadata::sync_to_device(std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream) { + // Allocate GPU tensors + seq_ids = tensor::Tensor(DataType::kDataTypeInt32, num_tokens, true, alloc_gpu); + positions = tensor::Tensor(DataType::kDataTypeInt32, num_tokens, true, alloc_gpu); + seq_start_locs = tensor::Tensor(DataType::kDataTypeInt32, num_seqs + 1, true, alloc_gpu); + + seq_ids.set_device_type(DeviceType::kDeviceCUDA); + positions.set_device_type(DeviceType::kDeviceCUDA); + seq_start_locs.set_device_type(DeviceType::kDeviceCUDA); + + // Copy data to GPU + cudaStream_t cuda_stream = static_cast(stream); + + cudaMemcpyAsync(seq_ids.ptr(), seq_ids_cpu.data(), + num_tokens * sizeof(int32_t), cudaMemcpyHostToDevice, cuda_stream); + + cudaMemcpyAsync(positions.ptr(), positions_cpu.data(), + num_tokens * sizeof(int32_t), cudaMemcpyHostToDevice, cuda_stream); + + cudaMemcpyAsync(seq_start_locs.ptr(), seq_start_locs_cpu.data(), + (num_seqs + 1) * sizeof(int32_t), cudaMemcpyHostToDevice, cuda_stream); +} + +BatchMetadata BatchMetadata::create_decode( + const std::vector& seq_ids_in, + const std::vector& positions_in, + const std::vector& context_lens, + std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream) { + CHECK_EQ(seq_ids_in.size(), positions_in.size()); + CHECK_EQ(seq_ids_in.size(), context_lens.size()); + + BatchMetadata meta; + meta.num_tokens = static_cast(seq_ids_in.size()); + meta.num_seqs = static_cast(seq_ids_in.size()); + + // For decode, each sequence has exactly 1 token + meta.seq_ids_cpu = seq_ids_in; + meta.positions_cpu = positions_in; + meta.seq_lens_cpu = context_lens; + + // Build seq_start_locs: [0, 1, 2, ..., num_seqs] + meta.seq_start_locs_cpu.resize(meta.num_seqs + 1); + for (int32_t i = 0; i <= meta.num_seqs; ++i) { + meta.seq_start_locs_cpu[i] = i; + } + + // All sequences are in decode phase + 
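// Editor's note (not part of the original patch): worked example of what this
// factory produces for a pure-decode batch of three sequences; the values are
// illustrative but follow the conventions documented at the top of this file.
//
//   create_decode(seq_ids_in   = {0, 1, 2},      // kv_slots
//                 positions_in = {10, 26, 5},    // next position per sequence
//                 context_lens = {11, 27, 6}, ...)
//   produces:
//     num_tokens = num_seqs = 3
//     seq_ids_cpu        = {0, 1, 2}
//     positions_cpu      = {10, 26, 5}
//     seq_lens_cpu       = {11, 27, 6}
//     seq_start_locs_cpu = {0, 1, 2, 3}          // exactly one token per sequence
//     is_prefill_cpu     = {false, false, false}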
meta.is_prefill_cpu.resize(meta.num_seqs, false); + + meta.sync_to_device(alloc_gpu, alloc_cpu, stream); + return meta; +} + +BatchMetadata BatchMetadata::create_prefill( + int32_t seq_id, + int32_t start_pos, + int32_t num_tokens, + int32_t context_len, + std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream) { + CHECK_GT(num_tokens, 0); + + BatchMetadata meta; + meta.num_tokens = num_tokens; + meta.num_seqs = 1; + + // All tokens belong to the same sequence + meta.seq_ids_cpu.resize(num_tokens, seq_id); + + // Positions: [start_pos, start_pos+1, ..., start_pos+num_tokens-1] + meta.positions_cpu.resize(num_tokens); + for (int32_t i = 0; i < num_tokens; ++i) { + meta.positions_cpu[i] = start_pos + i; + } + + meta.seq_start_locs_cpu = {0, num_tokens}; + meta.seq_lens_cpu = {context_len}; + meta.is_prefill_cpu = {true}; + + meta.sync_to_device(alloc_gpu, alloc_cpu, stream); + return meta; +} + +BatchMetadata BatchMetadata::create_mixed( + const std::vector& seq_ids_in, + const std::vector& token_counts, + const std::vector& start_positions, + const std::vector& context_lens, + const std::vector& is_prefill, + std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream) { + int32_t num_seqs = static_cast(seq_ids_in.size()); + CHECK_EQ(token_counts.size(), num_seqs); + CHECK_EQ(start_positions.size(), num_seqs); + CHECK_EQ(context_lens.size(), num_seqs); + CHECK_EQ(is_prefill.size(), num_seqs); + + BatchMetadata meta; + meta.num_seqs = num_seqs; + meta.num_tokens = std::accumulate(token_counts.begin(), token_counts.end(), 0); + + meta.seq_ids_cpu.reserve(meta.num_tokens); + meta.positions_cpu.reserve(meta.num_tokens); + meta.seq_start_locs_cpu.resize(num_seqs + 1); + meta.seq_lens_cpu = context_lens; + meta.is_prefill_cpu = is_prefill; + + int32_t offset = 0; + for (int32_t i = 0; i < num_seqs; ++i) { + meta.seq_start_locs_cpu[i] = offset; + + for (int32_t t = 0; t < token_counts[i]; ++t) { + meta.seq_ids_cpu.push_back(seq_ids_in[i]); + meta.positions_cpu.push_back(start_positions[i] + t); + } + + offset += token_counts[i]; + } + meta.seq_start_locs_cpu[num_seqs] = offset; + + meta.sync_to_device(alloc_gpu, alloc_cpu, stream); + return meta; +} + +} // namespace base \ No newline at end of file diff --git a/kuiper/source/base/kv_cache_manager.cpp b/kuiper/source/base/kv_cache_manager.cpp new file mode 100644 index 0000000..1695c2f --- /dev/null +++ b/kuiper/source/base/kv_cache_manager.cpp @@ -0,0 +1,546 @@ +/** + * @author jintang-coder + * - KVCacheManager: KV Cache memory and sequence slot management + * + * ============================================================================= + * KVCacheManager KV缓存管理器 + * ============================================================================= + * + * 功能: 管理序列槽位(slot)分配 + 物理块(block)映射 + KV Cache读写 + * + * 核心数据结构: + * class KVCacheManager { + * tensor::Tensor kv_cache_dev_; // GPU上的KV缓存池 + * tensor::Tensor block_tables_host_; // [num_seqs, max_blocks_per_seq] 逻辑块→物理块映射 + * tensor::Tensor context_lens_host_; // [num_seqs] 每个序列的当前长度 + * std::vector free_seq_slots_; // 空闲序列槽位 + * BlockAllocator block_alloc_; // 物理块分配器 + * }; + * + * --- + * 具体例子: 3个序列的完整生命周期 + * + * 配置: + * num_seqs = 4, block_size = 16, max_blocks = 100 + * + * 初始状态: + * free_seq_slots_ = [3, 2, 1, 0] + * block_tables_ = [[-1,-1,...], [-1,-1,...], [-1,-1,...], [-1,-1,...]] + * context_lens_ = [0, 0, 0, 0] + * + * --- + * 步骤1: 分配序列槽位 + * + * int slot = kv_manager->allocate_sequence_slot(); + * // slot = 0 + * // free_seq_slots_ = [3, 2, 1] + * + * --- 
+ * 步骤2: 为Prefill分配blocks (假设prompt有50个tokens) + * + * kv_manager->allocate_blocks_for_tokens(slot=0, num_tokens=50); + * // 需要 ceil(50/16) = 4 个blocks + * // 调用 block_alloc_->allocate() 4次,获得 blocks [0,1,2,3] + * + * block_tables_[0] = [0, 1, 2, 3, -1, -1, ...] + * // │ │ │ │ + * // │ │ │ └─ logical_block_3 → physical_block_3 + * // │ │ └─ logical_block_2 → physical_block_2 + * // │ └─ logical_block_1 → physical_block_1 + * // └─ logical_block_0 → physical_block_0 + * + * context_lens_[0] = 50 + * + * --- + * 步骤3: Decode阶段 - 生成新token + * + * // token位置从50增长到64,仍在block_3内 (16*4=64) + * // 无需新块 + * + * // token位置增长到65,需要新块 + * kv_manager->allocate_blocks_for_tokens(slot=0, num_tokens=65); + * // logical_block_4 需要分配 → physical_block_4 + * block_tables_[0] = [0, 1, 2, 3, 4, -1, ...] + * + * --- + * 步骤4: 序列完成 - 释放资源 + * + * kv_manager->release_sequence_slot(slot=0); + * // 1. 收集 blocks [0,1,2,3,4] + * // 2. 调用 block_alloc_->free([0,1,2,3,4]) + * // 3. block_tables_[0] = [-1,-1,-1,...] + * // 4. context_lens_[0] = 0 + * // 5. free_seq_slots_.push_back(0) → [3,2,1,0] + * + * --- + * KV Cache 内存布局: + * + * kv_cache_dev_ shape: [max_blocks, num_layers, 2, block_size, kv_dim] + * + * 访问 seq0, layer2, token位置35 的 Key: + * logical_block = 35 / 16 = 2 + * token_in_block = 35 % 16 = 3 + * physical_block = block_tables_[0][2] = 2 + * + * offset = physical_block * (num_layers * 2 * block_size * kv_dim) + * + layer_idx * (2 * block_size * kv_dim) + * + 0 * (block_size * kv_dim) // K plane + * + token_in_block * kv_dim + * + * --- + * 与Scheduler的交互: + * + * Scheduler KVCacheManager + * │ │ + * │ admit_sequence() │ + * │ └─> allocate_sequence_slot() ───►│ 返回 slot + * │ └─> allocate_blocks_for_tokens()►│ 分配物理块 + * │ │ + * │ finish_sequence() │ + * │ └─> release_sequence_slot() ────►│ 释放slot+blocks + * │ │ + * + */ + +#include "base/kv_cache_manager.h" +#include +#include +#include "tensor/tensor.h" + +namespace base { + +base::Status KVCacheManager::init_paged_token_major( + base::DeviceType device_type, + std::shared_ptr alloc_dev, + std::shared_ptr alloc_cpu, + int32_t num_seqs, + int32_t num_layers, + int32_t num_kv_heads, + int32_t head_dim, + int32_t block_size, + int32_t max_blocks, + int32_t max_blocks_per_seq, + void* stream) { + + std::lock_guard lock(mu_); + + device_type_ = device_type; + num_seqs_ = num_seqs; + num_layers_ = num_layers; + num_kv_heads_ = num_kv_heads; + head_dim_ = head_dim; + kv_dim_ = num_kv_heads * head_dim; + block_size_ = block_size; + max_blocks_ = max_blocks; + max_blocks_per_seq_ = max_blocks_per_seq; + + alloc_dev_ = alloc_dev; + alloc_cpu_ = alloc_cpu; + + // Initialize block allocator + block_alloc_ = std::make_unique (max_blocks, block_size); + + // Allocate KV cache on device + // Layout: [max_blocks, num_layers, 2 (K+V), block_size, kv_dim] + size_t kv_cache_size = static_cast(max_blocks) * num_layers * 2 * block_size * kv_dim_; + kv_cache_dev_ = tensor::Tensor(base::DataType::kDataTypeFp32, kv_cache_size); + kv_cache_dev_.allocate(alloc_dev_); + kv_cache_dev_.set_device_type(device_type); + + // Initialize to zero + if (device_type == base::DeviceType::kDeviceCUDA) { + cudaMemsetAsync(kv_cache_dev_.ptr(), 0, kv_cache_size * sizeof(float), + static_cast(stream)); + } else { + memset(kv_cache_dev_.ptr(), 0, kv_cache_size * sizeof(float)); + } + + // Allocate block tables on host and device + // Layout: [num_seqs, max_blocks_per_seq] + int32_t block_table_size = num_seqs * max_blocks_per_seq; + block_tables_host_ = tensor::Tensor(base::DataType::kDataTypeInt32, 
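// Editor's note (not part of the original patch): a rough sizing of the
// kv_cache_size computed above, under an illustrative 7B-llama-like shape.
// These numbers are assumptions for the example, not taken from this repo:
//   num_layers = 32, num_kv_heads = 32, head_dim = 128 (kv_dim_ = 4096),
//   block_size = 16, max_blocks = 1024
//
//   floats = 1024 * 32 * 2 * 16 * 4096 = 4,294,967,296  -> about 16 GiB at 4 B/float
//   per token: 32 layers * 2 planes * 4096 floats * 4 B = 1 MiB of fp32 KV cache
//   capacity: 1024 blocks * 16 tokens/block = 16,384 cacheable token positions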
block_table_size); + block_tables_host_.allocate(alloc_cpu_); + block_tables_host_.set_device_type(base::DeviceType::kDeviceCPU); + + block_tables_dev_ = tensor::Tensor(base::DataType::kDataTypeInt32, block_table_size); + block_tables_dev_.allocate(alloc_dev_); + block_tables_dev_.set_device_type(device_type); + + // Initialize block tables to -1 (invalid) + int32_t* block_table_ptr = block_tables_host_.ptr(); + for (int32_t i = 0; i < block_table_size; ++i) { + block_table_ptr[i] = -1; + } + + // Allocate context lengths on host and device + // Layout: [num_seqs] + context_lens_host_ = tensor::Tensor(base::DataType::kDataTypeInt32, num_seqs); + context_lens_host_.allocate(alloc_cpu_); + context_lens_host_.set_device_type(base::DeviceType::kDeviceCPU); + + context_lens_dev_ = tensor::Tensor(base::DataType::kDataTypeInt32, num_seqs); + context_lens_dev_.allocate(alloc_dev_); + context_lens_dev_.set_device_type(device_type); + // Initialize context lengths to 0 + int32_t* context_lens_ptr = context_lens_host_.ptr(); + for (int32_t i = 0; i < num_seqs; ++i) { + context_lens_ptr[i] = 0; + } + + // Initialize free sequence slots (reverse order so we allocate 0, 1, 2, ...) + free_seq_slots_.clear(); + free_seq_slots_.reserve(num_seqs); + for (int32_t i = num_seqs - 1; i >= 0; --i) { + free_seq_slots_.push_back(i); + } + + tables_dirty_ = true; + + return base::error::Success(); +} + +std::pair KVCacheManager::slice_kv( + int32_t seq_idx, int32_t layer_idx, int32_t token_pos) { + + std::lock_guard lock(mu_); + + CHECK_LT(seq_idx, num_seqs_) << "Sequence index out of bounds"; + CHECK_LT(layer_idx, num_layers_) << "Layer index out of bounds"; + + // Determine which logical block this token belongs to + int32_t logical_block = token_pos / block_size_; + int32_t token_in_block = token_pos % block_size_; + + // Get or allocate physical block + int32_t physical_block_id = ensure_physical_block_(seq_idx, logical_block); + CHECK_GE(physical_block_id, 0) << "Failed to allocate physical block"; + + // Update context length if needed + int32_t* context_lens_ptr = context_lens_host_.ptr(); + if (token_pos >= context_lens_ptr[seq_idx]) { + context_lens_ptr[seq_idx] = token_pos + 1; + tables_dirty_ = true; + } + + // Calculate pointers to K and V + float* key_ptr = key_ptr_(physical_block_id, layer_idx, token_in_block); + float* value_ptr = value_ptr_(physical_block_id, layer_idx, token_in_block); + + // Create tensor views (non-owning) + tensor::Tensor key(base::DataType::kDataTypeFp32, kv_dim_, false, nullptr, key_ptr); + tensor::Tensor val(base::DataType::kDataTypeFp32, kv_dim_, false, nullptr, value_ptr); + + key.set_device_type(device_type_); + val.set_device_type(device_type_); + + return {key, val}; +} + +const tensor::Tensor& KVCacheManager::kv_cache_device() const { + return kv_cache_dev_; +} + +const tensor::Tensor& KVCacheManager::block_tables_device() { + std::lock_guard lock(mu_); + if (tables_dirty_) { + sync_tables_to_device_unlocked_(nullptr); + } + return block_tables_dev_; +} + +const tensor::Tensor& KVCacheManager::context_lens_device() { + std::lock_guard lock(mu_); + if (tables_dirty_) { + sync_tables_to_device_unlocked_(nullptr); + } + return context_lens_dev_; +} + +const tensor::Tensor& KVCacheManager::block_tables_host() const { + return block_tables_host_; +} + +const tensor::Tensor& KVCacheManager::context_lens_host() const { + return context_lens_host_; +} + +void KVCacheManager::sync_tables_to_device(void* stream) { + std::lock_guard lock(mu_); + 
sync_tables_to_device_unlocked_(stream); +} + +void KVCacheManager::sync_tables_to_device_unlocked_(void* stream) { + if (!tables_dirty_) { + return; + } + + + // Copy block tables + if (device_type_ == base::DeviceType::kDeviceCUDA) { + cudaMemcpyAsync(block_tables_dev_.ptr(), + block_tables_host_.ptr(), + block_tables_host_.size() * sizeof(int32_t), + cudaMemcpyHostToDevice, + static_cast(stream)); + + cudaMemcpyAsync(context_lens_dev_.ptr(), + context_lens_host_.ptr(), + context_lens_host_.size() * sizeof(int32_t), + cudaMemcpyHostToDevice, + static_cast(stream)); + } else { + memcpy(block_tables_dev_.ptr(), + block_tables_host_.ptr(), + block_tables_host_.size() * sizeof(int32_t)); + + memcpy(context_lens_dev_.ptr(), + context_lens_host_.ptr(), + context_lens_host_.size() * sizeof(int32_t)); + } + + tables_dirty_ = false; +} + +void KVCacheManager::reset_sequence(int32_t seq_idx) { + std::lock_guard lock(mu_); + + CHECK_LT(seq_idx, num_seqs_) << "Sequence index out of bounds"; + + // Free all blocks for this sequence + std::vector blocks_to_free; + int32_t* block_table_ptr = block_tables_host_.ptr(); + + for (int32_t i = 0; i < max_blocks_per_seq_; ++i) { + int32_t block_id = block_table_ptr[seq_idx * max_blocks_per_seq_ + i]; + if (block_id >= 0) { + blocks_to_free.push_back(block_id); + block_table_ptr[seq_idx * max_blocks_per_seq_ + i] = -1; + } + } + + if (!blocks_to_free.empty()) { + block_alloc_->free(blocks_to_free); + } + + // Reset context length + int32_t* context_lens_ptr = context_lens_host_.ptr(); + context_lens_ptr[seq_idx] = 0; + + tables_dirty_ = true; +} + +void KVCacheManager::reset_all() { + std::lock_guard lock(mu_); + + // Reset all sequences + int32_t* block_table_ptr = block_tables_host_.ptr(); + for (int32_t i = 0; i < num_seqs_ * max_blocks_per_seq_; ++i) { + block_table_ptr[i] = -1; + } + + int32_t* context_lens_ptr = context_lens_host_.ptr(); + for (int32_t i = 0; i < num_seqs_; ++i) { + context_lens_ptr[i] = 0; + } + + // Reinitialize block allocator + block_alloc_ = std::make_unique(max_blocks_, block_size_); + + tables_dirty_ = true; +} + +int32_t KVCacheManager::block_size() const { + return block_size_; +} + +int32_t KVCacheManager::max_blocks_per_seq() const { + return max_blocks_per_seq_; +} + +int32_t KVCacheManager::kv_dim() const { + return kv_dim_; +} + +int32_t KVCacheManager::num_layers() const { + return num_layers_; +} + +int32_t KVCacheManager::num_kv_heads() const { + return num_kv_heads_; +} + +int32_t KVCacheManager::head_dim() const { + return head_dim_; +} + +int32_t KVCacheManager::num_seqs() const { + return num_seqs_; +} + +int32_t KVCacheManager::max_blocks() const { + return max_blocks_; +} + +int32_t KVCacheManager::get_context_len(int32_t seq_idx) const { + std::lock_guard lock(mu_); + CHECK_LT(seq_idx, num_seqs_) << "Sequence index out of bounds"; + return context_lens_host_.ptr()[seq_idx]; +} + +void KVCacheManager::set_context_len(int32_t seq_idx, int32_t context_len) { + std::lock_guard lock(mu_); + CHECK_LT(seq_idx, num_seqs_) << "Sequence index out of bounds"; + CHECK_GE(context_len, 0) << "Context length must be non-negative"; + + int32_t* context_lens_ptr = context_lens_host_.ptr(); + context_lens_ptr[seq_idx] = context_len; + tables_dirty_ = true; +} + +bool KVCacheManager::allocate_blocks_for_tokens(int32_t seq_idx, int32_t num_tokens) { + std::lock_guard lock(mu_); + CHECK_LT(seq_idx, num_seqs_) << "Sequence index out of bounds"; + + if (num_tokens <= 0) { + return true; + } + + // Calculate how many blocks we need + 
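// Editor's note (not part of the original patch): the line below is integer
// ceil-division. With block_size_ = 16:
//   num_tokens = 50  ->  (50 + 15) / 16 = 4 blocks   (covers positions 0..63)
//   num_tokens = 64  ->  (64 + 15) / 16 = 4 blocks   (exactly full)
//   num_tokens = 65  ->  (65 + 15) / 16 = 5 blocks   (first token of a new block)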
int32_t num_blocks_needed = (num_tokens + block_size_ - 1) / block_size_; + + if (num_blocks_needed > max_blocks_per_seq_) { + LOG(ERROR) << "Requested " << num_blocks_needed << " blocks, but max_blocks_per_seq is " + << max_blocks_per_seq_; + return false; + } + + // Check current allocation and allocate missing blocks + int32_t* block_table_ptr = block_tables_host_.ptr(); + + for (int32_t logical_block = 0; logical_block < num_blocks_needed; ++logical_block) { + int32_t table_idx = seq_idx * max_blocks_per_seq_ + logical_block; + if (block_table_ptr[table_idx] < 0) { + // Need to allocate this block + int32_t physical_block_id = block_alloc_->allocate(); + if (physical_block_id < 0) { + LOG(ERROR) << "Out of memory: cannot allocate block " << logical_block; + return false; + } + block_table_ptr[table_idx] = physical_block_id; + tables_dirty_ = true; + } + } + + // Update context length + int32_t* context_lens_ptr = context_lens_host_.ptr(); + if (num_tokens > context_lens_ptr[seq_idx]) { + context_lens_ptr[seq_idx] = num_tokens; + tables_dirty_ = true; + } + + return true; +} + + +// Private helper methods + + float* KVCacheManager::key_ptr_(int32_t physical_block_id, int32_t layer_idx, + int32_t token_in_block) { + // Layout: [max_blocks, num_layers, 2 (K+V), block_size, kv_dim] + size_t offset = static_cast(physical_block_id) * num_layers_ * 2 * block_size_ * kv_dim_ + + static_cast(layer_idx) * 2 * block_size_ * kv_dim_ + + 0 * block_size_ * kv_dim_ // K plane (index 0) + + static_cast(token_in_block) * kv_dim_; + + return (kv_cache_dev_.ptr()) + offset; +} + + float* KVCacheManager::value_ptr_(int32_t physical_block_id, int32_t layer_idx, + int32_t token_in_block) { + // Layout: [max_blocks, num_layers, 2 (K+V), block_size, kv_dim] + size_t offset = static_cast(physical_block_id) * num_layers_ * 2 * block_size_ * kv_dim_ + + static_cast(layer_idx) * 2 * block_size_ * kv_dim_ + + 1 * block_size_ * kv_dim_ // V plane (index 1) + + static_cast(token_in_block) * kv_dim_; + + return (kv_cache_dev_.ptr()) + offset; +} + +int32_t KVCacheManager::ensure_physical_block_(int32_t seq_idx, int32_t logical_block) { + CHECK_LT(logical_block, max_blocks_per_seq_) << "Logical block index out of bounds"; + + int32_t* block_table_ptr = block_tables_host_.ptr(); + int32_t table_idx = seq_idx * max_blocks_per_seq_ + logical_block; + + int32_t physical_block_id = block_table_ptr[table_idx]; + + // If not allocated, allocate a new block + if (physical_block_id < 0) { + physical_block_id = block_alloc_->allocate(); + CHECK_GE(physical_block_id, 0) << "Out of memory: cannot allocate new block"; + + block_table_ptr[table_idx] = physical_block_id; + tables_dirty_ = true; + } + + return physical_block_id; +} + +int32_t KVCacheManager::allocate_sequence_slot() { + std::lock_guard lock(mu_); + if (free_seq_slots_.empty()) { + return -1; + } + int32_t slot = free_seq_slots_.back(); + free_seq_slots_.pop_back(); + return slot; +} + +void KVCacheManager::release_sequence_slot(int32_t seq_idx) { + std::lock_guard lock(mu_); + if (seq_idx < 0 || seq_idx >= num_seqs_) { + return; + } + + // Free all blocks for this sequence + std::vector blocks_to_free; + int32_t* block_table_ptr = block_tables_host_.ptr(); + + for (int32_t i = 0; i < max_blocks_per_seq_; ++i) { + int32_t block_id = block_table_ptr[seq_idx * max_blocks_per_seq_ + i]; + if (block_id >= 0) { + blocks_to_free.push_back(block_id); + block_table_ptr[seq_idx * max_blocks_per_seq_ + i] = -1; + } + } + + if (!blocks_to_free.empty()) { + 
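// Editor's note (not part of the original patch): worked example for the
// key_ptr_/value_ptr_ offset math above, using deliberately tiny illustrative
// dimensions (num_layers_ = 2, block_size_ = 4, kv_dim_ = 8):
//
//   per-block stride = num_layers_ * 2 * block_size_ * kv_dim_ = 2*2*4*8 = 128 floats
//
//   physical_block_id = 3, layer_idx = 1, token_in_block = 2:
//     K offset = 3*128 + 1*(2*4*8) + 0*(4*8) + 2*8 = 384 + 64 +  0 + 16 = 464
//     V offset = 3*128 + 1*(2*4*8) + 1*(4*8) + 2*8 = 384 + 64 + 32 + 16 = 496
//
//   i.e. the V entry of a given (block, layer, token) always sits
//   block_size_ * kv_dim_ floats after the corresponding K entry.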
block_alloc_->free(blocks_to_free); + } + + // Reset context length + int32_t* context_lens_ptr = context_lens_host_.ptr(); + context_lens_ptr[seq_idx] = 0; + + tables_dirty_ = true; + + // Return slot to free pool + free_seq_slots_.push_back(seq_idx); +} + +bool KVCacheManager::has_free_slot() const { + std::lock_guard lock(mu_); + return !free_seq_slots_.empty(); +} + +int32_t KVCacheManager::num_free_slots() const { + std::lock_guard lock(mu_); + return static_cast(free_seq_slots_.size()); +} + +int32_t KVCacheManager::num_free_blocks() const { + std::lock_guard lock(mu_); + return block_alloc_->get_free_block_num(); +} + +} // namespace base diff --git a/kuiper/source/base/scheduler.cpp b/kuiper/source/base/scheduler.cpp new file mode 100644 index 0000000..41b8e8d --- /dev/null +++ b/kuiper/source/base/scheduler.cpp @@ -0,0 +1,738 @@ +/** + * @author jintang-coder + * - Scheduler: Request scheduling and continuous batching + * + * ============================================================================= + * Scheduler 调度器 + * ============================================================================= + * + * 功能: 管理推理请求队列,决定每次迭代处理哪些序列的哪些tokens + * + * 核心数据结构: + * class Scheduler { + * std::deque waiting_queue_; // 等待队列 (新请求) + * std::deque running_queue_; // 运行队列 (已分配KV slot) + * std::vector finished_seqs_; // 已完成序列 + * std::unordered_map sequences_; // 序列详情 + * KVCacheManager* kv_manager_; // KV缓存管理器 + * }; + * + * --- + * 具体例子: 3个请求的调度过程 + * + * 请求到达: + * Seq0: "Hello world" (10 tokens), max_new_tokens=50 + * Seq1: "Once upon a time" (20 tokens), max_new_tokens=50 + * Seq2: "The quick brown fox" (15 tokens), max_new_tokens=50 + * + * 配置: + * max_num_batched_tokens = 32 + * max_prefill_tokens = 16 (chunked prefill) + * decode_priority = true + * + * --- + * 迭代1: schedule() - 首次调度 + * + * waiting_queue_ = [0, 1, 2] + * running_queue_ = [] + * + * 1. schedule_decode() - 无decode序列,跳过 + * 2. schedule_prefill() - 无running序列,跳过 + * 3. try_admit_waiting() - 从waiting移入running + * + * remaining_tokens = 32 + * + * admit_sequence(seq0): + * kv_slot = kv_manager_->allocate_sequence_slot() → 0 + * chunk_size = min(32, 16, 10) = 10 + * remaining_tokens = 32 - 10 = 22 + * + * admit_sequence(seq1): + * kv_slot = kv_manager_->allocate_sequence_slot() → 1 + * chunk_size = min(22, 16, 20) = 16 + * remaining_tokens = 22 - 16 = 6 + * + * admit_sequence(seq2): + * kv_slot = kv_manager_->allocate_sequence_slot() → 2 + * chunk_size = min(6, 16, 15) = 6 // 用完剩余token额度 + * remaining_tokens = 6 - 6 = 0 + * + * 输出 SchedulerOutput: + * scheduled_seqs = [ + * {seq_id=0, kv_slot=0, num_tokens=10, start_pos=0, is_prefill=true}, + * {seq_id=1, kv_slot=1, num_tokens=16, start_pos=0, is_prefill=true}, + * {seq_id=2, kv_slot=2, num_tokens=6, start_pos=0, is_prefill=true}, + * ] + * num_tokens = 32, num_prefill_tokens = 32 + * + * 状态变化: + * waiting_queue_ = [] // 全部admit + * running_queue_ = [0, 1, 2] + * seq0.prefill_pos = 0 → 10 (prefill完成) + * seq1.prefill_pos = 0 → 16 (部分prefill, 还剩4个) + * seq2.prefill_pos = 0 → 6 (部分prefill, 还剩9个) + * + * --- + * 迭代2: schedule() - 混合批次 (Prefill续 + Decode) + * + * running_queue_ = [0, 1, 2] + * remaining_tokens = 32 + * + * 1. schedule_decode() - seq0已完成prefill,加入decode + * {seq_id=0, num_tokens=1, start_pos=10, is_prefill=false} + * remaining_tokens = 32 - 1 = 31 + * + * 2. 
schedule_prefill() - seq1还有4个token, seq2还有9个token + * seq1: chunk_size = min(31, 16, 4) = 4 + * {seq_id=1, num_tokens=4, start_pos=16, is_prefill=true} + * remaining_tokens = 31 - 4 = 27 + * + * seq2: chunk_size = min(27, 16, 9) = 9 + * {seq_id=2, num_tokens=9, start_pos=6, is_prefill=true} + * remaining_tokens = 27 - 9 = 18 + * + * 3. try_admit_waiting() - waiting为空,跳过 + * + * 输出 SchedulerOutput: + * scheduled_seqs = [ + * {seq_id=0, kv_slot=0, num_tokens=1, is_prefill=false}, // decode + * {seq_id=1, kv_slot=1, num_tokens=4, is_prefill=true}, // prefill完成 + * {seq_id=2, kv_slot=2, num_tokens=9, is_prefill=true}, // prefill完成 + * ] + * num_tokens = 14 + * num_decode_tokens = 1, num_prefill_tokens = 13 + * + * 状态变化: + * seq0.prefill_pos = 10 (已完成), 进入decode + * seq1.prefill_pos = 16 → 20 (prefill完成) + * seq2.prefill_pos = 6 → 15 (prefill完成) + * + * --- + * 迭代N: 全部进入decode阶段 + * + * 所有序列prefill完成,每次迭代: + * scheduled_seqs = [ + * {seq_id=0, num_tokens=1, is_prefill=false}, + * {seq_id=1, num_tokens=1, is_prefill=false}, + * {seq_id=2, num_tokens=1, is_prefill=false}, + * ] + * num_tokens = 3, num_decode_tokens = 3 + * + * --- + * 序列完成: finish_sequence() + * + * // 检测到EOS或达到max_new_tokens + * scheduler.finish_sequence(seq_id=0); + * + * 1. seq0.status = FINISHED + * 2. running_queue_.erase(0) → [1, 2] + * 3. kv_manager_->release_sequence_slot(kv_slot=0) + * 4. finished_seqs_.push_back(0) + * + * --- + * 调度策略 (decode_priority=true): + * + * 优先级: Decode > Prefill续 > 新Prefill + * + * 原因: Decode只需1个token,延迟敏感 + * Prefill可以分chunk,吞吐优先 + * + * schedule() 执行顺序: + * 1. schedule_decode() - 先处理所有decode序列 + * 2. schedule_prefill() - 再处理running中的prefill + * 3. try_admit_waiting() - 最后admit新序列 + * + * --- + * 与其他组件的关系: + * + * 用户请求 + * │ + * ▼ + * Scheduler.add_request() + * │ + * ▼ + * Scheduler.schedule() ──────► SchedulerOutput + * │ │ + * │ admit_sequence() │ scheduled_seqs + * ▼ ▼ + * KVCacheManager BatchMetadata.create_mixed() + * │ │ + * │ allocate_sequence_slot() │ 构建GPU映射数组 + * │ allocate_blocks_for_tokens() │ + * ▼ ▼ + * BlockAllocator GPU Kernels + * + * + * 最终驱逐策略 + + 优先级 (从高到低驱逐): + ┌─────────────────────────────────────────────────────────────┐ + │ 1. Prefill 阶段序列 - 按 context_len 从小到大 │ + │ (还没产出,用户影响最小) │ + ├─────────────────────────────────────────────────────────────┤ + │ 2. Decode 阶段序列 - 按 num_generated 从小到大 │ + │ (已在生成,用户正在等待) │ + └─────────────────────────────────────────────────────────────┘ + + 示例 + + running_queue_ = [seq0, seq1, seq2, seq3] + + seq0: prefill, context_len=100 (prefill了100个token) + seq1: prefill, context_len=20 ← 第1个驱逐目标 (prefill最短) + seq2: decode, num_generated=50 (已生成50个) + seq3: decode, num_generated=5 ← 第2个驱逐目标 (decode最短) + + 驱逐顺序: + 1. seq1 (prefill, context=20) - 优先驱逐prefill中最短的 + 2. seq0 (prefill, context=100) - 其次prefill中第二短的 + 3. seq3 (decode, generated=5) - 再次decode中最短的 + 4. 
seq2 (decode, generated=50) - 最后decode中第二短的 + * + * + * + * + * + * + * + */ + +#include "base/scheduler.h" +#include "base/kv_cache_manager.h" +#include +#include + +namespace base { + +Scheduler::Scheduler(const SchedulerConfig& config) : config_(config) {} + +void Scheduler::set_kv_cache_manager(KVCacheManager* kv_manager) { + std::lock_guard lock(mutex_); + kv_manager_ = kv_manager; +} + +int32_t Scheduler::add_request(const std::vector& prompt_tokens, + int32_t max_new_tokens, + const std::string& original_text) { + std::lock_guard lock(mutex_); + + Sequence seq; + seq.seq_id = next_seq_id_++; + seq.prompt_tokens = prompt_tokens; + seq.max_new_tokens = max_new_tokens; + seq.original_text = original_text; + seq.status = SequenceStatus::WAITING; + seq.arrival_time = std::chrono::steady_clock::now(); + + int32_t seq_id = seq.seq_id; + sequences_[seq_id] = std::move(seq); + waiting_queue_.push_back(seq_id); + + return seq_id; +} + +SchedulerOutput Scheduler::schedule() { + std::lock_guard lock(mutex_); + + SchedulerOutput output; + int32_t remaining_tokens = config_.max_num_batched_tokens; + int32_t remaining_seqs = config_.max_num_seqs; + + // Step 0: Check memory pressure and preempt if necessary + // Estimate blocks needed for decode (1 token per running decode sequence) + int32_t decode_blocks_needed = 0; + for (int32_t seq_id : running_queue_) { + auto it = sequences_.find(seq_id); + if (it != sequences_.end() && !it->second.is_prefill()) { + // Each decode may need a new block if crossing block boundary + decode_blocks_needed += estimate_blocks_needed(it->second, 1) - + estimate_blocks_needed(it->second, 0); + } + } + + if (need_preemption(decode_blocks_needed)) { + int32_t preempted = preempt_sequences(decode_blocks_needed); + if (preempted > 0) { + for (int32_t i = 0; i < preempted; ++i) { + if (!preempted_queue_.empty()) { + output.preempted_seq_ids.push_back(preempted_queue_.back()); + } + } + } + } + + if (config_.decode_priority) { + // Step 1: Schedule decode sequences first (each needs only 1 token) + schedule_decode(output, remaining_tokens, remaining_seqs); + + // Step 2: Schedule running prefill sequences + schedule_prefill(output, remaining_tokens, remaining_seqs); + + // Step 3: Try to re-admit preempted sequences (priority over new) + try_admit_preempted(output, remaining_tokens, remaining_seqs); + + // Step 4: Try to admit new sequences from waiting queue + try_admit_waiting(output, remaining_tokens, remaining_seqs); + } else { + // FCFS: process in order + schedule_prefill(output, remaining_tokens, remaining_seqs); + schedule_decode(output, remaining_tokens, remaining_seqs); + try_admit_preempted(output, remaining_tokens, remaining_seqs); + try_admit_waiting(output, remaining_tokens, remaining_seqs); + } + + return output; +} + +void Scheduler::schedule_decode(SchedulerOutput& output, int32_t& remaining_tokens, + int32_t& remaining_seqs) { + for (int32_t seq_id : running_queue_) { + if (remaining_tokens < 1 || remaining_seqs < 1) break; + + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) continue; + + Sequence& seq = it->second; + if (seq.status != SequenceStatus::RUNNING) continue; + if (seq.is_prefill()) continue; // Still in prefill phase + if (seq.next_token < 0) continue; // No token to decode + + add_to_schedule(output, seq, 1, false); + remaining_tokens -= 1; + remaining_seqs -= 1; + } +} + +void Scheduler::schedule_prefill(SchedulerOutput& output, int32_t& remaining_tokens, + int32_t& remaining_seqs) { + for (int32_t seq_id : running_queue_) 
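// Editor's note (not part of the original patch): the chunk size taken in the
// loop below is min(remaining_tokens, max_prefill_tokens, remaining_prefill);
// the same rule is reused in try_admit_waiting/try_admit_preempted. With the
// numbers from the example at the top of this file (max_prefill_tokens = 16):
//   remaining_tokens = 22, remaining_prefill = 20  ->  chunk = 16   (config cap)
//   remaining_tokens =  6, remaining_prefill = 15  ->  chunk =  6   (budget cap)
//   remaining_tokens = 31, remaining_prefill =  4  ->  chunk =  4   (finishes prefill)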
{ + if (remaining_tokens < 1 || remaining_seqs < 1) break; + + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) continue; + + Sequence& seq = it->second; + if (seq.status != SequenceStatus::RUNNING) continue; + if (!seq.is_prefill()) continue; // Already in decode phase + + // Calculate chunk size + int32_t remaining_prefill = seq.remaining_prefill(); + int32_t chunk_size = std::min({ + remaining_tokens, + config_.max_prefill_tokens, + remaining_prefill + }); + + if (chunk_size > 0) { + add_to_schedule(output, seq, chunk_size, true); + remaining_tokens -= chunk_size; + remaining_seqs -= 1; + } + } +} + +void Scheduler::try_admit_waiting(SchedulerOutput& output, int32_t& remaining_tokens, + int32_t& remaining_seqs) { + while (!waiting_queue_.empty() && remaining_tokens > 0 && remaining_seqs > 0) { + int32_t seq_id = waiting_queue_.front(); + + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) { + waiting_queue_.pop_front(); + continue; + } + + Sequence& seq = it->second; + + // Check if we can admit this sequence + if (!can_admit_new_sequence()) break; + + // Check memory pressure before admitting + int32_t blocks_needed = estimate_blocks_needed(seq, seq.prompt_len()); + if (need_preemption(blocks_needed)) break; + + // Try to allocate KV slot + if (!admit_sequence(seq)) break; + + waiting_queue_.pop_front(); + running_queue_.push_back(seq_id); + + // Schedule first prefill chunk + int32_t chunk_size = std::min({ + remaining_tokens, + config_.max_prefill_tokens, + seq.prompt_len() + }); + + if (chunk_size > 0) { + add_to_schedule(output, seq, chunk_size, true); + remaining_tokens -= chunk_size; + remaining_seqs -= 1; + } + } +} + +void Scheduler::try_admit_preempted(SchedulerOutput& output, int32_t& remaining_tokens, + int32_t& remaining_seqs) { + // Re-admit preempted sequences with priority over new requests + // Preempted sequences need to restart from beginning (KV cache was released) + while (!preempted_queue_.empty() && remaining_tokens > 0 && remaining_seqs > 0) { + int32_t seq_id = preempted_queue_.front(); + + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) { + preempted_queue_.pop_front(); + continue; + } + + Sequence& seq = it->second; + + // Check if we can admit this sequence + if (!can_admit_new_sequence()) break; + + // Check memory pressure before admitting + int32_t blocks_needed = estimate_blocks_needed(seq, seq.prompt_len()); + if (need_preemption(blocks_needed)) break; + + // Reset sequence state for re-prefill + seq.prefill_pos = 0; + seq.output_tokens.clear(); + seq.next_token = -1; + + // Try to allocate KV slot + if (!admit_sequence(seq)) break; + + preempted_queue_.pop_front(); + running_queue_.push_back(seq_id); + + // Schedule first prefill chunk + int32_t chunk_size = std::min({ + remaining_tokens, + config_.max_prefill_tokens, + seq.prompt_len() + }); + + if (chunk_size > 0) { + add_to_schedule(output, seq, chunk_size, true); + remaining_tokens -= chunk_size; + remaining_seqs -= 1; + } + + LOG(INFO) << "Re-admitted preempted sequence " << seq_id + << " (preemptions=" << seq.num_preemptions << ")"; + } +} + +void Scheduler::add_to_schedule(SchedulerOutput& output, Sequence& seq, + int32_t num_tokens, bool is_prefill) { + ScheduledSequence sched_seq; + sched_seq.seq_id = seq.seq_id; + sched_seq.kv_slot = seq.kv_slot; + sched_seq.num_tokens = num_tokens; + sched_seq.is_prefill = is_prefill; + + if (is_prefill) { + // Prefill: extract tokens from prompt + sched_seq.start_pos = seq.prefill_pos; + sched_seq.context_len = 
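// Editor's note (not part of the original patch): concrete values produced by
// this function, reusing the seq2 numbers from the example at the top of the
// file (prompt_len = 15, second prefill chunk); t stands for whatever token
// was last sampled for the sequence.
//
//   prefill call with prefill_pos = 6, num_tokens = 9:
//     -> start_pos = 6, context_len = 15, tokens = prompt_tokens[6..15)
//
//   later decode call with context_len() = 15, next_token = t:
//     -> start_pos = 15, context_len = 16, tokens = {t}, num_tokens = 1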
seq.prefill_pos + num_tokens; + sched_seq.tokens.assign( + seq.prompt_tokens.begin() + seq.prefill_pos, + seq.prompt_tokens.begin() + seq.prefill_pos + num_tokens + ); + output.num_prefill_tokens += num_tokens; + output.num_prefill_seqs += 1; + } else { + // Decode: single token + sched_seq.start_pos = seq.context_len(); + sched_seq.context_len = seq.context_len() + 1; + sched_seq.tokens = {seq.next_token}; + output.num_decode_tokens += 1; + output.num_decode_seqs += 1; + } + + output.num_tokens += num_tokens; + output.scheduled_seqs.push_back(std::move(sched_seq)); +} + +void Scheduler::update_after_forward(const SchedulerOutput& output, + const std::vector& next_tokens) { + std::lock_guard lock(mutex_); + + for (size_t i = 0; i < output.scheduled_seqs.size(); ++i) { + const auto& sched_seq = output.scheduled_seqs[i]; + auto it = sequences_.find(sched_seq.seq_id); + if (it == sequences_.end()) continue; + + Sequence& seq = it->second; + + if (sched_seq.is_prefill) { + // Update prefill progress + seq.prefill_pos += sched_seq.num_tokens; + + // If prefill just completed, record first token time + if (!seq.is_prefill() && seq.first_token_time.time_since_epoch().count() == 0) { + seq.first_token_time = std::chrono::steady_clock::now(); + } + } else { + // Decode: add the token we just processed to output + seq.output_tokens.push_back(seq.next_token); + } + + // Update next token from model output + if (i < next_tokens.size()) { + seq.next_token = next_tokens[i]; + } + + // Update KV cache context length + if (kv_manager_ != nullptr && seq.kv_slot >= 0) { + kv_manager_->set_context_len(seq.kv_slot, seq.context_len()); + } + } +} + +void Scheduler::finish_sequence(int32_t seq_id) { + std::lock_guard lock(mutex_); + + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) return; + + Sequence& seq = it->second; + if (seq.status == SequenceStatus::FINISHED) return; + + seq.status = SequenceStatus::FINISHED; + + // Remove from running queue + auto run_it = std::find(running_queue_.begin(), running_queue_.end(), seq_id); + if (run_it != running_queue_.end()) { + running_queue_.erase(run_it); + } + + // Release KV slot via KVCacheManager + if (seq.kv_slot >= 0 && kv_manager_ != nullptr) { + kv_manager_->release_sequence_slot(seq.kv_slot); + seq.kv_slot = -1; + } + + finished_seqs_.push_back(seq_id); +} + +const Sequence* Scheduler::get_sequence(int32_t seq_id) const { + std::lock_guard lock(mutex_); + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) return nullptr; + return &it->second; +} + +Sequence* Scheduler::get_sequence_mut(int32_t seq_id) { + std::lock_guard lock(mutex_); + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) return nullptr; + return &it->second; +} + +bool Scheduler::all_finished() const { + std::lock_guard lock(mutex_); + return waiting_queue_.empty() && running_queue_.empty() && preempted_queue_.empty(); +} + +bool Scheduler::has_pending_work() const { + std::lock_guard lock(mutex_); + return !waiting_queue_.empty() || !running_queue_.empty() || !preempted_queue_.empty(); +} + +int32_t Scheduler::num_waiting() const { + std::lock_guard lock(mutex_); + return static_cast(waiting_queue_.size()); +} + +int32_t Scheduler::num_running() const { + std::lock_guard lock(mutex_); + return static_cast(running_queue_.size()); +} + +int32_t Scheduler::num_preempted() const { + std::lock_guard lock(mutex_); + return static_cast(preempted_queue_.size()); +} + +int32_t Scheduler::num_finished() const { + std::lock_guard lock(mutex_); + return 
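// Editor's note (not part of the original patch): token bookkeeping in
// update_after_forward() above, traced over three iterations of one sequence
// (token ids are arbitrary):
//
//   iter 1  prefill chunk finishes, model returns 42
//           -> prefill_pos = prompt_len, next_token = 42, output_tokens = {}
//   iter 2  decode step scheduled with tokens = {42}, model returns 7
//           -> output_tokens = {42}, next_token = 7
//   iter 3  decode step scheduled with tokens = {7}, model returns 99
//           -> output_tokens = {42, 7}, next_token = 99
//
//   i.e. a sampled token only moves into output_tokens one iteration later,
//   when the decode step that consumed it is reported back.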
static_cast(finished_seqs_.size()); +} + +std::vector Scheduler::get_finished_sequences() const { + std::lock_guard lock(mutex_); + std::vector result; + result.reserve(finished_seqs_.size()); + for (int32_t seq_id : finished_seqs_) { + auto it = sequences_.find(seq_id); + if (it != sequences_.end()) { + result.push_back(&it->second); + } + } + return result; +} + +std::vector Scheduler::get_all_sequences() const { + std::lock_guard lock(mutex_); + std::vector result; + result.reserve(sequences_.size()); + for (const auto& pair : sequences_) { + result.push_back(&pair.second); + } + // Sort by seq_id for consistent ordering + std::sort(result.begin(), result.end(), + [](const Sequence* a, const Sequence* b) { + return a->seq_id < b->seq_id; + }); + return result; +} + +bool Scheduler::can_admit_new_sequence() const { + // Check if KVCacheManager has free slots + if (kv_manager_ == nullptr) { + return false; + } + return kv_manager_->has_free_slot(); +} + +int32_t Scheduler::estimate_blocks_needed(const Sequence& seq, int32_t additional_tokens) const { + int32_t total_tokens = seq.context_len() + additional_tokens; + return (total_tokens + config_.block_size - 1) / config_.block_size; +} + +bool Scheduler::admit_sequence(Sequence& seq) { + if (kv_manager_ == nullptr) { + return false; + } + + // Allocate KV slot via KVCacheManager + int32_t kv_slot = kv_manager_->allocate_sequence_slot(); + if (kv_slot < 0) { + return false; + } + + seq.kv_slot = kv_slot; + seq.status = SequenceStatus::RUNNING; + + // Pre-allocate KV cache blocks for prompt + if (!kv_manager_->allocate_blocks_for_tokens(kv_slot, seq.prompt_len())) { + // Failed to allocate blocks, release slot and return + kv_manager_->release_sequence_slot(kv_slot); + seq.kv_slot = -1; + seq.status = SequenceStatus::WAITING; + return false; + } + + return true; +} + +bool Scheduler::need_preemption(int32_t blocks_needed) const { + if (kv_manager_ == nullptr) { + return false; + } + int32_t free_blocks = kv_manager_->num_free_blocks(); + // Need preemption if free blocks < needed + min_free_blocks buffer + return free_blocks < (blocks_needed + config_.min_free_blocks); +} + +int32_t Scheduler::select_victim_sequence() const { + // Strategy (priority order): + // 1. Prefill sequences: select by smallest context_len (least computation invested) + // 2. 
Decode sequences: select by smallest num_generated (least output produced) + // + // Rationale: + // - Prefill sequences haven't produced any output yet, user impact is minimal + // - Decode sequences are actively generating, user is waiting for output + + int32_t victim_id = -1; + int32_t min_metric = INT32_MAX; + bool found_prefill_victim = false; + + // First pass: look for prefill sequences (higher priority to evict) + for (int32_t seq_id : running_queue_) { + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) continue; + + const Sequence& seq = it->second; + if (seq.status != SequenceStatus::RUNNING) continue; + + if (seq.is_prefill()) { + // Prefill: use context_len as metric + int32_t ctx_len = seq.context_len(); + if (ctx_len < min_metric) { + victim_id = seq_id; + min_metric = ctx_len; + found_prefill_victim = true; + } + } + } + + // If found a prefill victim, return it + if (found_prefill_victim) { + return victim_id; + } + + // Second pass: look for decode sequences (lower priority to evict) + min_metric = INT32_MAX; + for (int32_t seq_id : running_queue_) { + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) continue; + + const Sequence& seq = it->second; + if (seq.status != SequenceStatus::RUNNING) continue; + + if (!seq.is_prefill()) { + // Decode: use num_generated as metric + int32_t generated = seq.num_generated(); + if (generated < min_metric) { + victim_id = seq_id; + min_metric = generated; + } + } + } + + return victim_id; +} + +int32_t Scheduler::preempt_sequences(int32_t blocks_needed) { + int32_t num_preempted = 0; + + while (need_preemption(blocks_needed) && !running_queue_.empty()) { + int32_t victim_id = select_victim_sequence(); + if (victim_id < 0) break; + + auto it = sequences_.find(victim_id); + if (it == sequences_.end()) break; + + Sequence& victim = it->second; + + // Release KV resources + if (victim.kv_slot >= 0 && kv_manager_ != nullptr) { + kv_manager_->release_sequence_slot(victim.kv_slot); + victim.kv_slot = -1; + } + + // Update status + victim.status = SequenceStatus::PREEMPTED; + victim.num_preemptions++; + + // Remove from running queue + auto run_it = std::find(running_queue_.begin(), running_queue_.end(), victim_id); + if (run_it != running_queue_.end()) { + running_queue_.erase(run_it); + } + + // Add to preempted queue (will be rescheduled with priority) + preempted_queue_.push_back(victim_id); + + num_preempted++; + LOG(WARNING) << "Preempted sequence " << victim_id + << " (generated=" << victim.num_generated() + << ", preemptions=" << victim.num_preemptions << ")"; + } + + return num_preempted; +} + +} // namespace base \ No newline at end of file diff --git a/kuiper/source/model/llama3.cpp b/kuiper/source/model/llama3.cpp index 3b0d255..c64fbd1 100644 --- a/kuiper/source/model/llama3.cpp +++ b/kuiper/source/model/llama3.cpp @@ -1,6 +1,7 @@ #include "model/llama3.h" #include #include +#include #include #include #include @@ -8,6 +9,8 @@ #include #include "../op/kernels/cpu/rope_kernel.h" #include "../op/kernels/cuda/rope_kernel.cuh" +#include "../op/kernels/cuda/paged_attention_kernel.cuh" +#include "../op/kernels/kernels_interface.h" #include "base/tick.h" namespace model { @@ -153,12 +156,18 @@ base::Status LLama2Model::forward(const tensor::Tensor& input, const tensor::Ten return base::error::InternalError("Unsupported int8 quant in the cpu device"); } + const bool use_paged_attention = (kv_cache_manager_ != nullptr); + for (int32_t layer_idx = 0; layer_idx < config_->layer_num_; ++layer_idx) { 
attention_rms(layer_idx, input); // attention (wq wk wv @ input) attention_qkv(layer_idx, pos_tensor); // multi-head attention - attention_mha(layer_idx, pos_tensor); + if (use_paged_attention) { + attention_mha_paged(layer_idx, pos_tensor); + } else { + attention_mha(layer_idx, pos_tensor); + } // feed forward feed_forward(layer_idx, input); } @@ -675,6 +684,61 @@ void LLama2Model::attention_mha(int32_t layer_idx, const tensor::Tensor& pos_ten STATUS_CHECK(wo_layer->forward(mha_output, attn_output)); } +void LLama2Model::attention_mha_paged(int32_t layer_idx, const tensor::Tensor& pos_tensor) const { + CHECK(llama_layers_ != nullptr); + CHECK(kv_cache_manager_ != nullptr) << "KV cache manager is null for paged attention"; + + tensor::Tensor mha_output = get_buffer(ModelBufferType::kOutputMHA); + tensor::Tensor query = this->get_buffer(ModelBufferType::kQuery); + + // Get paged KV cache tensors + const tensor::Tensor& kv_cache = kv_cache_manager_->kv_cache_device(); + const tensor::Tensor& block_tables = kv_cache_manager_->block_tables_device(); + const tensor::Tensor& context_lens = kv_cache_manager_->context_lens_device(); + + // Get parameters from config and kv_cache_manager + const int32_t num_heads = config_->head_num_; + const int32_t num_kv_heads = config_->kv_head_num_; + const int32_t head_dim = config_->head_size_; + const int32_t num_layers = config_->layer_num_; + const int32_t block_size = kv_cache_manager_->block_size(); + const int32_t max_blocks_per_seq = kv_cache_manager_->max_blocks_per_seq(); + // Get num_seqs from KVCacheManager instead of hardcoding + const int32_t num_seqs = kv_cache_manager_->num_seqs(); + const int32_t max_context_len = kv_cache_manager_->get_context_len(0); + + // Scale factor for attention: 1/sqrt(head_dim) + const float scale = 1.0f / std::sqrt(static_cast(head_dim)); + + // Get CUDA stream + cudaStream_t stream = cuda_config_ ? 
cuda_config_->stream : nullptr; + + // Call paged attention kernel + kernel::paged_attention_kernel_cu( + mha_output, + query, + kv_cache, + block_tables, + context_lens, + max_context_len, + num_seqs, + num_heads, + num_kv_heads, + head_dim, + block_size, + max_blocks_per_seq, + scale, + layer_idx, + num_layers, + stream); + + // wo @ attention output + tensor::Tensor attn_output = get_buffer(ModelBufferType::kAttnOutput); + const auto& wo_layer = llama_layers_->wo_layers_.at(layer_idx); + CHECK_NE(wo_layer, nullptr) << "The weight output layer is null pointer."; + STATUS_CHECK(wo_layer->forward(mha_output, attn_output)); +} + void LLama2Model::feed_forward(int32_t layer_idx, const tensor::Tensor& input) const { CHECK(llama_layers_ != nullptr); // residual add @@ -744,4 +808,522 @@ int32_t LLama2Model::post_processing(const tensor::Tensor& pos, bool is_prompt) return next; } +void LLama2Model::init_batch_buffers(int32_t num_seqs) { + if (num_seqs <= 1) { + return; // No need for batch buffers with single sequence + } + + std::shared_ptr alloc; + if (device_type_ == base::DeviceType::kDeviceCPU) { + alloc = base::CPUDeviceAllocatorFactory::get_instance(); + } else { + alloc = base::CUDADeviceAllocatorFactory::get_instance(); + } + std::shared_ptr alloc_cpu = + base::CPUDeviceAllocatorFactory::get_instance(); + + // Batch input embeddings: [num_seqs, dim] + tensor::Tensor batch_input(base::DataType::kDataTypeFp32, num_seqs, config_->dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchInputEmbeddings, batch_input)); + + // Batch positions: [num_seqs] on CPU (for reading), and GPU copy + tensor::Tensor batch_positions_cpu(base::DataType::kDataTypeInt32, num_seqs, true, alloc_cpu); + tensor::Tensor batch_positions_gpu(base::DataType::kDataTypeInt32, num_seqs, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchPositions, batch_positions_gpu)); + + // Batch RMSNorm output: [num_seqs, dim] + tensor::Tensor batch_rms_output(base::DataType::kDataTypeFp32, num_seqs, config_->dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchRMSNormOutput, batch_rms_output)); + + // Batch Query: [num_seqs, dim] + tensor::Tensor batch_query(base::DataType::kDataTypeFp32, num_seqs, config_->dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchQuery, batch_query)); + + // Batch Key: [num_seqs, kv_dim] + tensor::Tensor batch_key(base::DataType::kDataTypeFp32, num_seqs, config_->kv_dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchKey, batch_key)); + + // Batch Value: [num_seqs, kv_dim] + tensor::Tensor batch_value(base::DataType::kDataTypeFp32, num_seqs, config_->kv_dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchValue, batch_value)); + + // Batch MHA output: [num_seqs, dim] + tensor::Tensor batch_mha_output(base::DataType::kDataTypeFp32, num_seqs, config_->dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchOutputMHA, batch_mha_output)); + + // Batch Attention output: [num_seqs, dim] + tensor::Tensor batch_attn_output(base::DataType::kDataTypeFp32, num_seqs, config_->dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchAttnOutput, batch_attn_output)); + + // Batch FFN RMSNorm: [num_seqs, dim] + tensor::Tensor batch_ffn_rms(base::DataType::kDataTypeFp32, num_seqs, config_->dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchFFNRMSNorm, batch_ffn_rms)); + + // Batch W1 output: [num_seqs, hidden_dim] + tensor::Tensor batch_w1_output(base::DataType::kDataTypeFp32, num_seqs, config_->hidden_dim_, true, 
alloc); + CHECK(insert_buffer(ModelBufferType::kBatchW1Output, batch_w1_output)); + + // Batch W3 output: [num_seqs, hidden_dim] + tensor::Tensor batch_w3_output(base::DataType::kDataTypeFp32, num_seqs, config_->hidden_dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchW3Output, batch_w3_output)); + + // Batch W2 output: [num_seqs, dim] + tensor::Tensor batch_w2_output(base::DataType::kDataTypeFp32, num_seqs, config_->dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchW2Output, batch_w2_output)); + + // Batch forward output: [num_seqs, vocab_size] - logits for each sequence + tensor::Tensor batch_forward_output(base::DataType::kDataTypeFp32, num_seqs, config_->vocab_size_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchForwardOutput, batch_forward_output)); + + LOG(INFO) << "Initialized batch buffers for " << num_seqs << " sequences"; +} + +base::Status LLama2Model::forward_batch(const tensor::Tensor& inputs, + const tensor::Tensor& positions, + std::vector& next_tokens) const { + // inputs: [num_seqs, dim] + // positions: [num_seqs] on GPU + if (inputs.is_empty()) { + return base::error::InvalidArgument("The input tensor is empty."); + } + + const int32_t num_seqs = kv_cache_manager_->num_seqs(); + if (num_seqs <= 1) { + return base::error::InvalidArgument("forward_batch requires num_seqs > 1"); + } + + // Get batch buffers + tensor::Tensor batch_rms_output = get_buffer(ModelBufferType::kBatchRMSNormOutput); + tensor::Tensor batch_query = get_buffer(ModelBufferType::kBatchQuery); + tensor::Tensor batch_key = get_buffer(ModelBufferType::kBatchKey); + tensor::Tensor batch_value = get_buffer(ModelBufferType::kBatchValue); + tensor::Tensor batch_mha_output = get_buffer(ModelBufferType::kBatchOutputMHA); + tensor::Tensor batch_attn_output = get_buffer(ModelBufferType::kBatchAttnOutput); + tensor::Tensor batch_ffn_rms = get_buffer(ModelBufferType::kBatchFFNRMSNorm); + tensor::Tensor batch_w1_output = get_buffer(ModelBufferType::kBatchW1Output); + tensor::Tensor batch_w3_output = get_buffer(ModelBufferType::kBatchW3Output); + tensor::Tensor batch_w2_output = get_buffer(ModelBufferType::kBatchW2Output); + + // Get paged KV cache tensors + const tensor::Tensor& kv_cache = kv_cache_manager_->kv_cache_device(); + const tensor::Tensor& block_tables = kv_cache_manager_->block_tables_device(); + const tensor::Tensor& context_lens = kv_cache_manager_->context_lens_device(); + + // Get parameters + const int32_t num_heads = config_->head_num_; + const int32_t num_kv_heads = config_->kv_head_num_; + const int32_t head_dim = config_->head_size_; + const int32_t num_layers = config_->layer_num_; + const int32_t block_size = kv_cache_manager_->block_size(); + const int32_t max_blocks_per_seq = kv_cache_manager_->max_blocks_per_seq(); + const int32_t dim = config_->dim_; + const int32_t kv_dim = config_->kv_dim_; + const float scale = 1.0f / std::sqrt(static_cast(head_dim)); + + cudaStream_t stream = cuda_config_ ? cuda_config_->stream : nullptr; + + // Copy input to working buffer + tensor::Tensor working_input = inputs; // [num_seqs, dim] + + for (int32_t layer_idx = 0; layer_idx < num_layers; ++layer_idx) { + // 1. Attention RMSNorm (batch) - use dim version for batch + auto rmsnorm_layer = std::dynamic_pointer_cast( + llama_layers_->rmsnorm_layers_.at(layer_idx)); + kernel::get_rmsnorm_dim_kernel(device_type_)(working_input, rmsnorm_layer->get_weight(0), + batch_rms_output, dim, stream); + + // 2. 
QKV projection (batch GEMM) + // Q = rms_output @ Wq^T + auto wq_layer = std::dynamic_pointer_cast( + llama_layers_->wq_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)(batch_rms_output, wq_layer->get_weight(0), + batch_query, cuda_config_.get()); + + // K = rms_output @ Wk^T + auto wk_layer = std::dynamic_pointer_cast( + llama_layers_->wk_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)(batch_rms_output, wk_layer->get_weight(0), + batch_key, cuda_config_.get()); + + // V = rms_output @ Wv^T + auto wv_layer = std::dynamic_pointer_cast( + llama_layers_->wv_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)(batch_rms_output, wv_layer->get_weight(0), + batch_value, cuda_config_.get()); + + // 3. RoPE (batch) + kernel::get_rope_batch_kernel(device_type_)( + num_seqs, dim, kv_dim, head_dim, + batch_query, batch_key, positions, + get_buffer(ModelBufferType::kSinCache), + get_buffer(ModelBufferType::kCosCache), + stream); + + // 4. Write K, V to paged KV cache (batch) + kernel::batch_kv_cache_write_cu( + const_cast(kv_cache), + batch_key, + batch_value, + block_tables, + positions, + num_seqs, + num_kv_heads, + head_dim, + block_size, + max_blocks_per_seq, + layer_idx, + num_layers, + stream); + + // 5. Paged Attention (already supports batch) + // Get max context len across all sequences + int32_t max_context_len = 0; + for (int32_t s = 0; s < num_seqs; ++s) { + int32_t ctx_len = kv_cache_manager_->get_context_len(s); + if (ctx_len > max_context_len) max_context_len = ctx_len; + } + + kernel::paged_attention_kernel_cu( + batch_mha_output, + batch_query, + kv_cache, + block_tables, + context_lens, + max_context_len, + num_seqs, + num_heads, + num_kv_heads, + head_dim, + block_size, + max_blocks_per_seq, + scale, + layer_idx, + num_layers, + stream); + + // 6. Output projection: O = mha_output @ Wo^T + auto wo_layer = std::dynamic_pointer_cast( + llama_layers_->wo_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)(batch_mha_output, wo_layer->get_weight(0), + batch_attn_output, cuda_config_.get()); + + // 7. Residual add: working_input += attn_output + kernel::get_add_kernel(device_type_)(working_input, batch_attn_output, working_input, stream); + + // 8. FFN RMSNorm + auto ffn_rmsnorm = std::dynamic_pointer_cast( + llama_layers_->rmsnorm_layers_.at(layer_idx + num_layers)); + kernel::get_rmsnorm_dim_kernel(device_type_)(working_input, ffn_rmsnorm->get_weight(0), + batch_ffn_rms, dim, stream); + + // 9. FFN: W1 and W3 + auto w1_layer = std::dynamic_pointer_cast( + llama_layers_->w1_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)(batch_ffn_rms, w1_layer->get_weight(0), + batch_w1_output, cuda_config_.get()); + + auto w3_layer = std::dynamic_pointer_cast( + llama_layers_->w3_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)(batch_ffn_rms, w3_layer->get_weight(0), + batch_w3_output, cuda_config_.get()); + + // 10. SwiGLU + kernel::get_swiglu_kernel(device_type_)(batch_w1_output, batch_w3_output, batch_w1_output, stream); + + // 11. W2 + auto w2_layer = std::dynamic_pointer_cast( + llama_layers_->w2_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)(batch_w1_output, w2_layer->get_weight(0), + batch_w2_output, cuda_config_.get()); + + // 12. 
Residual add + kernel::get_add_kernel(device_type_)(working_input, batch_w2_output, working_input, stream); + } + + // Final RMSNorm + auto final_rmsnorm = std::dynamic_pointer_cast( + llama_layers_->rmsnorm_layers_.at(2 * num_layers)); + kernel::get_rmsnorm_dim_kernel(device_type_)(working_input, final_rmsnorm->get_weight(0), + batch_rms_output, dim, stream); + + // Classification layer: logits = rms_output @ cls_weight^T + // Output: [num_seqs, vocab_size] + tensor::Tensor batch_forward_output = get_buffer(ModelBufferType::kBatchForwardOutput); + auto cls_layer = std::dynamic_pointer_cast(llama_layers_->cls_layer_); + kernel::get_gemm_kernel(device_type_)(batch_rms_output, cls_layer->get_weight(0), + batch_forward_output, cuda_config_.get()); + + cudaStreamSynchronize(stream); + + // Sample next token for each sequence + next_tokens.resize(num_seqs); + const int32_t vocab_size = config_->vocab_size_; + const float* logits_ptr = batch_forward_output.ptr(); + + for (int32_t s = 0; s < num_seqs; ++s) { + const float* seq_logits = logits_ptr + s * vocab_size; + next_tokens[s] = static_cast(sampler_->sample(seq_logits, vocab_size, stream)); + } + + return base::error::Success(); +} + +// ============================================================================ +// Unified forward implementation +// Supports: single decode, batch decode, chunked prefill, mixed prefill+decode +// ============================================================================ + +void LLama2Model::ensure_unified_buffers(int32_t num_tokens) const { + // Cast away const for buffer initialization (buffers are mutable cache) + const_cast(this)->init_unified_buffers(num_tokens); +} + +base::Status LLama2Model::forward_unified( + const tensor::Tensor& hidden_states, + const base::BatchMetadata& batch_meta, + std::vector& next_tokens) const { + + if (hidden_states.is_empty()) { + return base::error::InvalidArgument("The hidden_states tensor is empty."); + } + + CHECK(kv_cache_manager_ != nullptr) << "KV cache manager not initialized for unified forward"; + + const int32_t num_tokens = batch_meta.num_tokens; + const int32_t num_seqs = batch_meta.num_seqs; + const bool has_prefill = batch_meta.has_prefill(); + const bool has_decode = batch_meta.has_decode(); + + // Ensure we have buffers for this many tokens + ensure_unified_buffers(num_tokens); + + cudaStream_t stream = cuda_config_ ? 
cuda_config_->stream : nullptr; + + // Get unified buffers + tensor::Tensor unified_rms_output = get_buffer(ModelBufferType::kUnifiedRMSNormOutput); + tensor::Tensor unified_query = get_buffer(ModelBufferType::kUnifiedQuery); + tensor::Tensor unified_key = get_buffer(ModelBufferType::kUnifiedKey); + tensor::Tensor unified_value = get_buffer(ModelBufferType::kUnifiedValue); + tensor::Tensor unified_mha_output = get_buffer(ModelBufferType::kUnifiedMHAOutput); + tensor::Tensor unified_attn_output = get_buffer(ModelBufferType::kUnifiedAttnOutput); + tensor::Tensor unified_ffn_rms = get_buffer(ModelBufferType::kUnifiedFFNRMSNorm); + tensor::Tensor unified_w1_output = get_buffer(ModelBufferType::kUnifiedW1Output); + tensor::Tensor unified_w3_output = get_buffer(ModelBufferType::kUnifiedW3Output); + tensor::Tensor unified_w2_output = get_buffer(ModelBufferType::kUnifiedW2Output); + + // Reshape buffers for current num_tokens + unified_rms_output.reshape({num_tokens, config_->dim_}); + unified_query.reshape({num_tokens, config_->dim_}); + unified_key.reshape({num_tokens, config_->kv_dim_}); + unified_value.reshape({num_tokens, config_->kv_dim_}); + unified_mha_output.reshape({num_tokens, config_->dim_}); + unified_attn_output.reshape({num_tokens, config_->dim_}); + unified_ffn_rms.reshape({num_tokens, config_->dim_}); + unified_w1_output.reshape({num_tokens, config_->hidden_dim_}); + unified_w3_output.reshape({num_tokens, config_->hidden_dim_}); + unified_w2_output.reshape({num_tokens, config_->dim_}); + + // Get paged KV cache tensors + const tensor::Tensor& kv_cache = kv_cache_manager_->kv_cache_device(); + const tensor::Tensor& block_tables = kv_cache_manager_->block_tables_device(); + const tensor::Tensor& context_lens = kv_cache_manager_->context_lens_device(); + + // Get parameters + const int32_t num_heads = config_->head_num_; + const int32_t num_kv_heads = config_->kv_head_num_; + const int32_t head_dim = config_->head_size_; + const int32_t num_layers = config_->layer_num_; + const int32_t block_size = kv_cache_manager_->block_size(); + const int32_t max_blocks_per_seq = kv_cache_manager_->max_blocks_per_seq(); + const int32_t dim = config_->dim_; + const int32_t kv_dim = config_->kv_dim_; + const float scale = 1.0f / std::sqrt(static_cast(head_dim)); + + // Working tensor - copy of hidden_states that we modify in-place + tensor::Tensor working_input = hidden_states; // [num_tokens, dim] + + for (int32_t layer_idx = 0; layer_idx < num_layers; ++layer_idx) { + // 1. Attention RMSNorm: [num_tokens, dim] -> [num_tokens, dim] + auto rmsnorm_layer = std::dynamic_pointer_cast( + llama_layers_->rmsnorm_layers_.at(layer_idx)); + kernel::get_rmsnorm_dim_kernel(device_type_)( + working_input, rmsnorm_layer->get_weight(0), unified_rms_output, dim, stream); + + // 2. QKV Projection using GEMM + auto wq_layer = std::dynamic_pointer_cast( + llama_layers_->wq_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)( + unified_rms_output, wq_layer->get_weight(0), unified_query, cuda_config_.get()); + + auto wk_layer = std::dynamic_pointer_cast( + llama_layers_->wk_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)( + unified_rms_output, wk_layer->get_weight(0), unified_key, cuda_config_.get()); + + auto wv_layer = std::dynamic_pointer_cast( + llama_layers_->wv_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)( + unified_rms_output, wv_layer->get_weight(0), unified_value, cuda_config_.get()); + + // 3. 
RoPE: Apply rotary position embeddings + kernel::get_rope_batch_kernel(device_type_)( + num_tokens, dim, kv_dim, head_dim, + unified_query, unified_key, batch_meta.positions, + get_buffer(ModelBufferType::kSinCache), + get_buffer(ModelBufferType::kCosCache), + stream); + + // 4. Write K, V to paged KV cache using v2 kernel with seq_ids + kernel::batch_kv_cache_write_v2_cu( + const_cast(kv_cache), + unified_key, + unified_value, + block_tables, + batch_meta.positions, + batch_meta.seq_ids, + num_tokens, + num_kv_heads, + head_dim, + block_size, + max_blocks_per_seq, + layer_idx, + num_layers, + stream); + + // 5. Attention: Choose kernel based on scenario + int32_t max_context_len = batch_meta.max_context_len(); + + if (has_prefill) { + // Use prefill and decode mixed kernel + kernel::paged_attention_prefill_cu( + unified_mha_output, + unified_query, + kv_cache, + block_tables, + context_lens, + batch_meta.seq_ids, + batch_meta.positions, + num_tokens, + num_seqs, + num_heads, + num_kv_heads, + head_dim, + block_size, + max_blocks_per_seq, + scale, + layer_idx, + num_layers, + stream); + } else { + // Pure decode: use standard paged attention kernel + kernel::paged_attention_kernel_cu( + unified_mha_output, + unified_query, + kv_cache, + block_tables, + context_lens, + max_context_len, + num_seqs, + num_heads, + num_kv_heads, + head_dim, + block_size, + max_blocks_per_seq, + scale, + layer_idx, + num_layers, + stream); + } + + // 6. Output projection: O = mha_output @ Wo^T + auto wo_layer = std::dynamic_pointer_cast( + llama_layers_->wo_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)( + unified_mha_output, wo_layer->get_weight(0), unified_attn_output, cuda_config_.get()); + + // 7. Residual add: working_input += attn_output + kernel::get_add_kernel(device_type_)(working_input, unified_attn_output, working_input, stream); + + // 8. FFN RMSNorm + auto ffn_rmsnorm = std::dynamic_pointer_cast( + llama_layers_->rmsnorm_layers_.at(layer_idx + num_layers)); + kernel::get_rmsnorm_dim_kernel(device_type_)( + working_input, ffn_rmsnorm->get_weight(0), unified_ffn_rms, dim, stream); + + // 9. FFN: W1 and W3 + auto w1_layer = std::dynamic_pointer_cast( + llama_layers_->w1_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)( + unified_ffn_rms, w1_layer->get_weight(0), unified_w1_output, cuda_config_.get()); + + auto w3_layer = std::dynamic_pointer_cast( + llama_layers_->w3_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)( + unified_ffn_rms, w3_layer->get_weight(0), unified_w3_output, cuda_config_.get()); + + // 10. SwiGLU + kernel::get_swiglu_kernel(device_type_)(unified_w1_output, unified_w3_output, unified_w1_output, stream); + + // 11. W2 + auto w2_layer = std::dynamic_pointer_cast( + llama_layers_->w2_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)( + unified_w1_output, w2_layer->get_weight(0), unified_w2_output, cuda_config_.get()); + + // 12. 
Residual add + kernel::get_add_kernel(device_type_)(working_input, unified_w2_output, working_input, stream); + } + + // Final RMSNorm + auto final_rmsnorm = std::dynamic_pointer_cast( + llama_layers_->rmsnorm_layers_.at(2 * num_layers)); + kernel::get_rmsnorm_dim_kernel(device_type_)( + working_input, final_rmsnorm->get_weight(0), unified_rms_output, dim, stream); + + // Get last token indices for each sequence (for logits computation) + std::vector last_token_indices = batch_meta.get_last_token_indices(); + + // Compute logits only for the last token of each sequence + // For efficiency, we extract last tokens and do a smaller GEMM + next_tokens.resize(num_seqs); + auto cls_layer = std::dynamic_pointer_cast(llama_layers_->cls_layer_); + const int32_t vocab_size = config_->vocab_size_; + + // Allocate temporary buffer for last token hidden states and logits + std::shared_ptr alloc; + if (device_type_ == base::DeviceType::kDeviceCUDA) { + alloc = base::CUDADeviceAllocatorFactory::get_instance(); + } else { + alloc = base::CPUDeviceAllocatorFactory::get_instance(); + } + + tensor::Tensor last_hidden(base::DataType::kDataTypeFp32, num_seqs, dim, true, alloc); + tensor::Tensor logits(base::DataType::kDataTypeFp32, num_seqs, vocab_size, true, alloc); + + // Copy last token hidden states + for (int32_t s = 0; s < num_seqs; ++s) { + int32_t last_idx = last_token_indices[s]; + cudaMemcpyAsync( + last_hidden.ptr(s * dim), + unified_rms_output.ptr(last_idx * dim), + dim * sizeof(float), + cudaMemcpyDeviceToDevice, + stream); + } + + // Compute logits: [num_seqs, dim] @ [vocab_size, dim]^T -> [num_seqs, vocab_size] + kernel::get_gemm_kernel(device_type_)(last_hidden, cls_layer->get_weight(0), logits, cuda_config_.get()); + + cudaStreamSynchronize(stream); + + // Sample next token for each sequence + const float* logits_ptr = logits.ptr(); + for (int32_t s = 0; s < num_seqs; ++s) { + const float* seq_logits = logits_ptr + s * vocab_size; + next_tokens[s] = static_cast(sampler_->sample(seq_logits, vocab_size, stream)); + } + + return base::error::Success(); +} + } // namespace model \ No newline at end of file diff --git a/kuiper/source/model/model.cpp b/kuiper/source/model/model.cpp index b20b709..34bcd6c 100644 --- a/kuiper/source/model/model.cpp +++ b/kuiper/source/model/model.cpp @@ -1,4 +1,5 @@ #include "model/model.h" +#include #include #include #include @@ -18,9 +19,9 @@ const std::string& Model::token_path() const { return token_path_; } const std::string& Model::model_path() const { return model_path_; } base::Status Model::insert_buffer(ModelBufferType buffer_idx, const tensor::Tensor& tensor) { - if (buffers_.count(buffer_idx) > 0) { - return base::error::KeyHasExits(std::to_string(int(buffer_idx)) + " has exits in the buffers"); - } + // if (buffers_.count(buffer_idx) > 0) { + // return base::error::KeyHasExits(std::to_string(int(buffer_idx)) + " has exits in the buffers"); + // } if (tensor.is_empty()) { return base::error::InvalidArgument("The tensor is empty for inserting buffer."); } @@ -212,8 +213,43 @@ std::string Model::decode(std::vector token_idxs) const { return this->encode_layer_->decode(token_idxs); } +base::Status Model::init_paged_kv_cache(int32_t num_seqs, int32_t block_size, + int32_t max_blocks, int32_t max_blocks_per_seq) { + CHECK(config_ != nullptr); + CHECK_GT(num_seqs, 0); + CHECK_GT(block_size, 0); + CHECK_GT(max_blocks, 0); + + // Store num_seqs for batch processing + num_seqs_ = num_seqs; + + if (max_blocks_per_seq <= 0) { + max_blocks_per_seq = (config_->seq_len_ 
+ block_size - 1) / block_size; + } + + std::shared_ptr alloc_dev; + if (device_type_ == base::DeviceType::kDeviceCPU) { + alloc_dev = base::CPUDeviceAllocatorFactory::get_instance(); + } else { + alloc_dev = base::CUDADeviceAllocatorFactory::get_instance(); + } + std::shared_ptr alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + if (!kv_cache_manager_) { + kv_cache_manager_ = std::make_unique(); + } + + return kv_cache_manager_->init_paged_token_major( + device_type_, alloc_dev, alloc_cpu, num_seqs, config_->layer_num_, config_->kv_head_num_, + config_->head_size_, block_size, max_blocks, max_blocks_per_seq, nullptr); +} + std::pair Model::slice_kv_cache(int32_t layer_idx, int32_t token_pos) const { + if (kv_cache_manager_) { + return kv_cache_manager_->slice_kv(0, layer_idx, token_pos); // 这里默认只有一个seq + } + int32_t layer_offset = layer_idx * config_->seq_len_ * config_->kv_dim_; int32_t cache_offset = layer_offset + token_pos * config_->kv_dim_; @@ -258,4 +294,244 @@ tensor::Tensor Model::fill_input(const tensor::Tensor& pos_tensor, return input; } +// ============================================================================ +// Unified forward interface implementation +// ============================================================================ + +base::Status Model::forward_unified( + const tensor::Tensor& hidden_states, + const base::BatchMetadata& batch_meta, + std::vector& next_tokens) const { + // Default implementation - subclasses should override + return base::error::FunctionNotImplement("forward_unified not implemented in base Model class"); +} + +void Model::init_unified_buffers(int32_t max_tokens) { + if (max_tokens <= max_unified_tokens_) { + return; // Already have enough capacity + } + + std::shared_ptr alloc; + if (device_type_ == base::DeviceType::kDeviceCPU) { + alloc = base::CPUDeviceAllocatorFactory::get_instance(); + } else { + alloc = base::CUDADeviceAllocatorFactory::get_instance(); + } + + const int32_t dim = config_->dim_; + const int32_t kv_dim = config_->kv_dim_; + const int32_t hidden_dim = config_->hidden_dim_; + + // Allocate unified buffers for [max_tokens, dim/kv_dim/hidden_dim] + tensor::Tensor unified_query(base::DataType::kDataTypeFp32, max_tokens, dim, true, alloc); + tensor::Tensor unified_key(base::DataType::kDataTypeFp32, max_tokens, kv_dim, true, alloc); + tensor::Tensor unified_value(base::DataType::kDataTypeFp32, max_tokens, kv_dim, true, alloc); + tensor::Tensor unified_rms_output(base::DataType::kDataTypeFp32, max_tokens, dim, true, alloc); + tensor::Tensor unified_mha_output(base::DataType::kDataTypeFp32, max_tokens, dim, true, alloc); + tensor::Tensor unified_attn_output(base::DataType::kDataTypeFp32, max_tokens, dim, true, alloc); + tensor::Tensor unified_ffn_rms(base::DataType::kDataTypeFp32, max_tokens, dim, true, alloc); + tensor::Tensor unified_w1_output(base::DataType::kDataTypeFp32, max_tokens, hidden_dim, true, alloc); + tensor::Tensor unified_w3_output(base::DataType::kDataTypeFp32, max_tokens, hidden_dim, true, alloc); + tensor::Tensor unified_w2_output(base::DataType::kDataTypeFp32, max_tokens, dim, true, alloc); + + buffers_[ModelBufferType::kUnifiedQuery] = unified_query; + buffers_[ModelBufferType::kUnifiedKey] = unified_key; + buffers_[ModelBufferType::kUnifiedValue] = unified_value; + buffers_[ModelBufferType::kUnifiedRMSNormOutput] = unified_rms_output; + buffers_[ModelBufferType::kUnifiedMHAOutput] = unified_mha_output; + buffers_[ModelBufferType::kUnifiedAttnOutput] = unified_attn_output; + 
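  // Direct map assignment (buffers_[...] = t) overwrites any previous entry, which lets a
  // later, larger batch re-register these buffers after re-allocation; the capacity check at
  // the top of this function makes smaller requests a no-op. A minimal sketch of the
  // grow-only behaviour, with hypothetical token counts:
  //
  //   init_unified_buffers(512);   // allocates [512, dim], [512, kv_dim], [512, hidden_dim] buffers
  //   init_unified_buffers(128);   // no-op: 128 <= max_unified_tokens_ (512)
  //   init_unified_buffers(1024);  // re-allocates at [1024, ...] and replaces the map entries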
buffers_[ModelBufferType::kUnifiedFFNRMSNorm] = unified_ffn_rms; + buffers_[ModelBufferType::kUnifiedW1Output] = unified_w1_output; + buffers_[ModelBufferType::kUnifiedW3Output] = unified_w3_output; + buffers_[ModelBufferType::kUnifiedW2Output] = unified_w2_output; + + max_unified_tokens_ = max_tokens; + + LOG(INFO) << "Initialized unified buffers for max " << max_tokens << " tokens"; +} + +base::Status Model::forward_decode( + const std::vector& seq_ids, + const std::vector& positions, + const tensor::Tensor& hidden_states, + std::vector& next_tokens) const { + CHECK(kv_cache_manager_ != nullptr) << "KV cache manager not initialized"; + + std::shared_ptr alloc_gpu; + std::shared_ptr alloc_cpu; + if (device_type_ == base::DeviceType::kDeviceCUDA) { + alloc_gpu = base::CUDADeviceAllocatorFactory::get_instance(); + } else { + alloc_gpu = base::CPUDeviceAllocatorFactory::get_instance(); + } + alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + // Get context lengths from KV cache manager + std::vector context_lens(seq_ids.size()); + for (size_t i = 0; i < seq_ids.size(); ++i) { + context_lens[i] = kv_cache_manager_->get_context_len(seq_ids[i]); + } + + auto batch_meta = base::BatchMetadata::create_decode( + seq_ids, positions, context_lens, alloc_gpu, alloc_cpu, nullptr); + + return forward_unified(hidden_states, batch_meta, next_tokens); +} + +base::Status Model::forward_prefill( + int32_t seq_id, + const tensor::Tensor& hidden_states, + int32_t start_pos, + int32_t& next_token) const { + CHECK(kv_cache_manager_ != nullptr) << "KV cache manager not initialized"; + + std::shared_ptr alloc_gpu; + std::shared_ptr alloc_cpu; + if (device_type_ == base::DeviceType::kDeviceCUDA) { + alloc_gpu = base::CUDADeviceAllocatorFactory::get_instance(); + } else { + alloc_gpu = base::CPUDeviceAllocatorFactory::get_instance(); + } + alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + // Calculate num_tokens from hidden_states shape + int32_t num_tokens = static_cast(hidden_states.size() / config_->dim_); + int32_t context_len = start_pos + num_tokens; + + auto batch_meta = base::BatchMetadata::create_prefill( + seq_id, start_pos, num_tokens, context_len, alloc_gpu, alloc_cpu, nullptr); + + std::vector next_tokens; + auto status = forward_unified(hidden_states, batch_meta, next_tokens); + if (!status) { + return status; + } + + if (!next_tokens.empty()) { + next_token = next_tokens[0]; + } + return base::error::Success(); +} + +int32_t Model::compute_chunk_size(int32_t total_tokens, int32_t num_active_seqs) const { + // Default chunk size limits + constexpr int32_t kMinChunkSize = 16; + constexpr int32_t kMaxChunkSize = 2048; + constexpr int32_t kDefaultChunkSize = 512; + + if (total_tokens <= kMinChunkSize) { + return total_tokens; + } + + // Estimate memory per token (rough approximation) + // Each token needs: Q, K, V, intermediate buffers + // Q: dim, K: kv_dim, V: kv_dim, MHA output: dim, FFN intermediates: 2*hidden_dim + size_t per_token_floats = config_->dim_ * 4 + // Q, attn_out, rms_out, w2_out + config_->kv_dim_ * 2 + // K, V + config_->hidden_dim_ * 2; // W1, W3 outputs + size_t per_token_bytes = per_token_floats * sizeof(float); + + // Try to get available GPU memory (simplified - in practice would use cudaMemGetInfo) + // For now, use a conservative estimate based on typical GPU memory + size_t available_memory = 2ULL * 1024 * 1024 * 1024; // Assume 2GB available for activations + + // Account for other sequences in the batch + size_t memory_for_prefill = 
available_memory / (num_active_seqs > 0 ? num_active_seqs : 1); + + int32_t max_tokens_by_memory = static_cast(memory_for_prefill / per_token_bytes); + + // Clamp to reasonable range + int32_t chunk_size = std::min({ + total_tokens, + max_tokens_by_memory, + kMaxChunkSize + }); + + chunk_size = std::max(chunk_size, kMinChunkSize); + + // Round down to multiple of 16 for better GPU efficiency + chunk_size = (chunk_size / 16) * 16; + if (chunk_size == 0) chunk_size = kMinChunkSize; + + return chunk_size; +} + +base::Status Model::forward_chunked_prefill( + int32_t seq_id, + const tensor::Tensor& hidden_states, + int32_t start_pos, + int32_t chunk_size, + int32_t& next_token) const { + CHECK(kv_cache_manager_ != nullptr) << "KV cache manager not initialized"; + + int32_t total_tokens = static_cast(hidden_states.size() / config_->dim_); + + if (total_tokens == 0) { + return base::error::InvalidArgument("Empty hidden_states for chunked prefill"); + } + + // Auto-compute chunk size if not specified + if (chunk_size <= 0) { + chunk_size = compute_chunk_size(total_tokens, 1); + } + + std::shared_ptr alloc_gpu; + std::shared_ptr alloc_cpu; + if (device_type_ == base::DeviceType::kDeviceCUDA) { + alloc_gpu = base::CUDADeviceAllocatorFactory::get_instance(); + } else { + alloc_gpu = base::CPUDeviceAllocatorFactory::get_instance(); + } + alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + int32_t current_pos = start_pos; + int32_t tokens_processed = 0; + + while (tokens_processed < total_tokens) { + int32_t tokens_this_chunk = std::min(chunk_size, total_tokens - tokens_processed); + + // Create a view into hidden_states for this chunk + // hidden_states: [total_tokens, dim] + // chunk: [tokens_this_chunk, dim] + int32_t offset = tokens_processed * config_->dim_; + tensor::Tensor chunk_hidden(base::DataType::kDataTypeFp32, + tokens_this_chunk, config_->dim_, + false, nullptr, + const_cast(hidden_states.ptr(offset))); + chunk_hidden.set_device_type(device_type_); + + // Allocate blocks for this chunk + int32_t context_len_after_chunk = current_pos + tokens_this_chunk; + if (!kv_cache_manager_->allocate_blocks_for_tokens(seq_id, context_len_after_chunk)) { + return base::error::InternalError("Failed to allocate KV cache blocks for prefill chunk"); + } + + // Create batch metadata for this chunk + auto batch_meta = base::BatchMetadata::create_prefill( + seq_id, current_pos, tokens_this_chunk, context_len_after_chunk, + alloc_gpu, alloc_cpu, nullptr); + + // Forward this chunk + std::vector chunk_next_tokens; + auto status = forward_unified(chunk_hidden, batch_meta, chunk_next_tokens); + if (!status) { + return status; + } + + // Update position + current_pos += tokens_this_chunk; + tokens_processed += tokens_this_chunk; + + // Keep the last token prediction + if (!chunk_next_tokens.empty()) { + next_token = chunk_next_tokens[0]; + } + } + + // Update context length in KV cache manager + kv_cache_manager_->set_context_len(seq_id, current_pos); + + return base::error::Success(); +} + } // namespace model \ No newline at end of file diff --git a/kuiper/source/model/model_paged.cpp b/kuiper/source/model/model_paged.cpp new file mode 100644 index 0000000..b20b709 --- /dev/null +++ b/kuiper/source/model/model_paged.cpp @@ -0,0 +1,261 @@ +#include "model/model.h" +#include +#include +#include +namespace model { +Model::Model(base::TokenizerType tokenizer_type, base::ModelType model_type, std::string token_path, + std::string model_path, bool is_quant_model) + : tokenizer_type_(tokenizer_type), + 
model_type_(model_type), + token_path_(std::move(token_path)), + model_path_(std::move(model_path)), + is_quant_model_(is_quant_model) {} + +base::ModelType Model::model_type() const { return model_type_; } + +const std::string& Model::token_path() const { return token_path_; } + +const std::string& Model::model_path() const { return model_path_; } + +base::Status Model::insert_buffer(ModelBufferType buffer_idx, const tensor::Tensor& tensor) { + if (buffers_.count(buffer_idx) > 0) { + return base::error::KeyHasExits(std::to_string(int(buffer_idx)) + " has exits in the buffers"); + } + if (tensor.is_empty()) { + return base::error::InvalidArgument("The tensor is empty for inserting buffer."); + } + buffers_.insert({buffer_idx, tensor}); + return base::error::Success(); +} + +tensor::Tensor& Model::get_buffer(ModelBufferType buffer_idx) { + CHECK_GT(buffers_.count(buffer_idx), 0) << int(buffer_idx); + return buffers_.at(buffer_idx); +} + +const tensor::Tensor& Model::get_buffer(ModelBufferType buffer_idx) const { + CHECK_GT(buffers_.count(buffer_idx), 0); + return buffers_.at(buffer_idx); +} + +base::Status Model::read_model_file() { + using namespace base; + if (model_path_.empty()) { + return error::PathNotValid("Failed to open the weight file, the model path is empty!"); + } + int32_t fd = open(model_path_.data(), O_RDONLY); + if (fd == -1) { + return error::PathNotValid("Failed to open the weight file " + model_path_ + + " may be the path does not exist!"); + } + + FILE* file = fopen(model_path_.data(), "rb"); + if (!file) { + return error::PathNotValid("Failed to open the file. The path may be invalid."); + } + + auto config = ModelConfig{}; + if (fread(&config, sizeof(ModelConfig), 1, file) != 1) { + return error::ModelParseError( + "Failed to retrieve the configuration information from the model " + "file."); + } + if (is_quant_model_) { + if (fread(&group_size_, sizeof(int32_t), 1, file) != 1) { + return error::ModelParseError( + "Failed to retrieve the group size information from the model " + "file."); + } + } + + auto gen_status = generate_model_infos(config); + if (!gen_status) { + return gen_status; + } + + if (!is_quant_model_) { + raw_model_data_ = std::make_shared(); + } else { + raw_model_data_ = std::make_shared(); + } + + struct stat sb; + if (fstat(fd, &sb) == -1) { + close(fd); + return error::ModelParseError( + "Failed to retrieve the file size information from the model " + "file."); + } + raw_model_data_->file_size = sb.st_size; + + raw_model_data_->fd = fd; + raw_model_data_->data = + mmap(nullptr, raw_model_data_->file_size, PROT_READ, MAP_PRIVATE, raw_model_data_->fd, 0); + + if (raw_model_data_->data == MAP_FAILED || raw_model_data_->data == nullptr) { + return error::ModelParseError("Failed to map the weight file " + model_path_ + " into memory."); + } + if (!is_quant_model_) { + raw_model_data_->weight_data = + static_cast(raw_model_data_->data) + sizeof(ModelConfig); + } else { + raw_model_data_->weight_data = + static_cast(raw_model_data_->data) + sizeof(ModelConfig) + sizeof(group_size_); + } + if (raw_model_data_ == nullptr) { + LOG(ERROR); + return error::ModelParseError("Failed to map the weight file " + model_path_ + + " into memory, the pointer to weight start address is null"); + } + return error::Success(); +} + +base::Status Model::generate_model_infos(const ModelConfig& config) const { + config_->dim_ = config.dim; + config_->hidden_dim_ = config.hidden_dim; + config_->layer_num_ = config.layer_num; + config_->head_num_ = config.head_num; + 
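  // The derived fields a few lines below follow directly from these raw values. A worked
  // example with hypothetical grouped-query numbers (not read from any real model file):
  //   dim = 4096, head_num = 32, kv_head_num = 8
  //   head_size_ = dim / head_num               = 4096 / 32     = 128
  //   kv_dim_    = dim * kv_head_num / head_num = 4096 * 8 / 32 = 1024
  //   kv_mul_    = head_num / kv_head_num       = 32 / 8        = 4   (4 query heads per KV head)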
config_->kv_head_num_ = config.kv_head_num; + config_->seq_len_ = config.seq_len; + + config_->kv_dim_ = (config.dim * config.kv_head_num) / config.head_num; + config_->kv_mul_ = config.head_num / config.kv_head_num; + config_->head_size_ = config.dim / config.head_num; +#if defined(QWEN3_SUPPORT) + config_->immediate_dim_ = config.immediate_dim_; +#endif + if (config.vocab_size > 0) { + config_->is_shared_weight_ = true; + } else { + config_->is_shared_weight_ = false; + } + + // Qwen tokenizer size and embedding size is mismatched + // refer: https://github.com/QwenLM/Qwen2.5/issues/29 + // if (std::abs(config.vocab_size) != config_->vocab_size_) { + // return base::error::ModelParseError( + // "Vocabulary size mismatch between the model file and the token list."); + // } + config_->vocab_size_ = std::abs(config.vocab_size); + return base::error::Success(); +} + +base::Status Model::create_encode_layer() { + using namespace base; + + // create token encode decode layer + if (tokenizer_type_ == TokenizerType::kEncodeSpe) { + encode_layer_ = std::make_unique(this->token_path_, true, false); + } else { +#ifdef LLAMA3_SUPPORT + encode_layer_ = std::make_unique(this->token_path_, true, false); +#endif + +#if defined(QWEN2_SUPPORT) || defined(QWEN3_SUPPORT) + encode_layer_ = std::make_unique(this->token_path_, false, false); +#endif + } + if (!encode_layer_) { + return error::InternalError("Create the encode layer failed."); + } + + config_->vocab_size_ = encode_layer_->vocab_size(); + if (config_->vocab_size_ <= 0) { + return error::InternalError("The vocab size param read error from the model file!"); + } + return error::Success(); +} + +base::Status Model::gen_model_from_file() { + using namespace base; + config_ = std::make_unique(); + + // init sentence piece processor + // google sentence piece + auto create_encode_status = create_encode_layer(); + if (!create_encode_status) { + LOG(ERROR) << "Create the encode layer failed!"; + return create_encode_status; + } + // mmap + auto mmap_status = read_model_file(); + if (!mmap_status) { + LOG(ERROR) << "Handle model file " << model_path_ << " failed!"; + return mmap_status; + } + auto layer_create_status = create_layers(); + if (!layer_create_status) { + LOG(ERROR) << "Create layers for the model file " << model_path_ << " failed!"; + return layer_create_status; + } + + return error::Success(); +} + +std::vector Model::encode(const std::string& sentence) const { + CHECK(encode_layer_ != nullptr); + return encode_layer_->encode(sentence); +} + +bool Model::is_sentence_ending(int32_t token_idx) const { + CHECK(this->encode_layer_ != nullptr); + return this->encode_layer_->is_sentence_ending(token_idx); +} + +std::string Model::decode(int32_t token_idx) const { + CHECK(this->encode_layer_ != nullptr); + return this->encode_layer_->decode(token_idx); +} + +std::string Model::decode(std::vector token_idxs) const { + CHECK(this->encode_layer_ != nullptr); + return this->encode_layer_->decode(token_idxs); +} + +std::pair Model::slice_kv_cache(int32_t layer_idx, + int32_t token_pos) const { + int32_t layer_offset = layer_idx * config_->seq_len_ * config_->kv_dim_; + int32_t cache_offset = layer_offset + token_pos * config_->kv_dim_; + + float* key_cache_ptr = + const_cast(get_buffer(ModelBufferType::kKeyCache).ptr(cache_offset)); + float* val_cache_ptr = + const_cast(get_buffer(ModelBufferType::kValueCache).ptr(cache_offset)); + + tensor::Tensor key(base::DataType::kDataTypeFp32, config_->kv_dim_, false, nullptr, + key_cache_ptr); + tensor::Tensor 
val(base::DataType::kDataTypeFp32, config_->kv_dim_, false, nullptr, + val_cache_ptr); + key.set_device_type(device_type_); + val.set_device_type(device_type_); + return {key, val}; +} + +tensor::Tensor Model::fill_input(const tensor::Tensor& pos_tensor, + const op::EmbeddingOutput& embedding_output, + bool is_prompt) const { + const int32_t pos = pos_tensor.index(0); + auto [input_tokens, input_embeddings, input_token_num] = embedding_output; + + int32_t index = 0; + if (is_prompt) { + index = pos; + } +#if defined(QWEN3_SUPPORT) + std::shared_ptr input_emb_buffer = std::make_shared( + config_->hidden_dim_ * sizeof(float), nullptr, + input_embeddings.ptr(index * config_->hidden_dim_), true); + tensor::Tensor input(base::DataType::kDataTypeFp32, config_->hidden_dim_); + +#else + std::shared_ptr input_emb_buffer = + std::make_shared(config_->dim_ * sizeof(float), nullptr, + input_embeddings.ptr(index * config_->dim_), true); + tensor::Tensor input(base::DataType::kDataTypeFp32, config_->dim_); +#endif + input.assign(input_emb_buffer); + input.set_device_type(device_type_); + return input; +} + +} // namespace model \ No newline at end of file diff --git a/kuiper/source/op/kernels/cuda/gemm_kernel.cu b/kuiper/source/op/kernels/cuda/gemm_kernel.cu new file mode 100644 index 0000000..b78f11b --- /dev/null +++ b/kuiper/source/op/kernels/cuda/gemm_kernel.cu @@ -0,0 +1,322 @@ +#include +#include +#include "gemm_kernel.cuh" +#include "matmul_kernel.cuh" + +namespace kernel { + +// ============================================================================ +// Tiled GEMM Kernel Configuration +// ============================================================================ +// Block tile size: each thread block computes BLOCK_M x BLOCK_N output elements +// Thread tile size: each thread computes THREAD_M x THREAD_N output elements +// K dimension is processed in chunks of BLOCK_K + +constexpr int BLOCK_M = 64; // Block tile size in M dimension +constexpr int BLOCK_N = 64; // Block tile size in N dimension +constexpr int BLOCK_K = 32; // Block tile size in K dimension +constexpr int THREAD_M = 4; // Thread tile size in M dimension +constexpr int THREAD_N = 4; // Thread tile size in N dimension + +// Derived constants +constexpr int THREADS_M = BLOCK_M / THREAD_M; // 16 threads in M dimension +constexpr int THREADS_N = BLOCK_N / THREAD_N; // 16 threads in N dimension +constexpr int THREADS_PER_BLOCK = THREADS_M * THREADS_N; // 256 threads total + +// ============================================================================ +// Tiled GEMM Kernel +// ============================================================================ +// Computes C[M,N] = A[M,K] x B[N,K]^T where B is stored as [N,K] +// +// Memory Layout: +// A: row-major [M, K] +// B: row-major [N, K] (logically transposed, so B[n,k] accesses weight[n*K+k]) +// C: row-major [M, N] +// +// Optimization techniques: +// 1. Shared memory tiling to reduce global memory access +// 2. Thread-level tiling for register reuse +// 3. Vectorized loads (float4) where possible +// 4. 
Loop unrolling for better instruction-level parallelism + +template +__global__ void gemm_tiled_kernel( + const float* __restrict__ A, // [M, K] + const float* __restrict__ B, // [N, K] - weight matrix (transposed storage) + float* __restrict__ C, // [M, N] + const int M, const int K, const int N) { + + // Block and thread indices + const int bx = blockIdx.x; // Block index in N dimension + const int by = blockIdx.y; // Block index in M dimension + const int tx = threadIdx.x; // Thread index in N dimension (0 to THREADS_N-1) + const int ty = threadIdx.y; // Thread index in M dimension (0 to THREADS_M-1) + + // Starting position of this block's output tile in C + const int c_row_start = by * BM; + const int c_col_start = bx * BN; + + // Shared memory for A and B tiles + __shared__ float As[BM][BK]; + __shared__ float Bs[BK][BN]; + + // Register accumulator for this thread's output tile (TM x TN elements) + float c_reg[TM][TN] = {{0.0f}}; + + // This thread's position within the block's output tile + const int thread_row = ty * TM; // Starting row in block tile + const int thread_col = tx * TN; // Starting col in block tile + + // Linear thread ID for cooperative loading + const int tid = ty * (BN / TN) + tx; + constexpr int NUM_THREADS = (BM / TM) * (BN / TN); + + // Calculate how many elements each thread loads for A and B tiles + constexpr int A_TILE_ELEMENTS = BM * BK; + constexpr int B_TILE_ELEMENTS = BK * BN; + constexpr int A_LOADS_PER_THREAD = (A_TILE_ELEMENTS + NUM_THREADS - 1) / NUM_THREADS; + constexpr int B_LOADS_PER_THREAD = (B_TILE_ELEMENTS + NUM_THREADS - 1) / NUM_THREADS; + + // Iterate over K dimension in chunks of BK + for (int k_start = 0; k_start < K; k_start += BK) { + + // ========== Load A tile into shared memory ========== + // A tile: [BM, BK] from A[c_row_start:c_row_start+BM, k_start:k_start+BK] + #pragma unroll + for (int load_idx = 0; load_idx < A_LOADS_PER_THREAD; ++load_idx) { + int elem_idx = tid + load_idx * NUM_THREADS; + if (elem_idx < A_TILE_ELEMENTS) { + int a_tile_row = elem_idx / BK; + int a_tile_col = elem_idx % BK; + int global_row = c_row_start + a_tile_row; + int global_col = k_start + a_tile_col; + + if (global_row < M && global_col < K) { + As[a_tile_row][a_tile_col] = A[global_row * K + global_col]; + } else { + As[a_tile_row][a_tile_col] = 0.0f; + } + } + } + + // ========== Load B tile into shared memory ========== + // B is stored as [N, K], we load B[c_col_start:c_col_start+BN, k_start:k_start+BK] + // and store it transposed as Bs[BK, BN] for coalesced access during computation + #pragma unroll + for (int load_idx = 0; load_idx < B_LOADS_PER_THREAD; ++load_idx) { + int elem_idx = tid + load_idx * NUM_THREADS; + if (elem_idx < B_TILE_ELEMENTS) { + int b_tile_row = elem_idx / BN; // K dimension (0 to BK-1) + int b_tile_col = elem_idx % BN; // N dimension (0 to BN-1) + int global_k = k_start + b_tile_row; + int global_n = c_col_start + b_tile_col; + + if (global_k < K && global_n < N) { + // B[N, K] -> Bs[K, N]: B[global_n, global_k] = B[global_n * K + global_k] + Bs[b_tile_row][b_tile_col] = B[global_n * K + global_k]; + } else { + Bs[b_tile_row][b_tile_col] = 0.0f; + } + } + } + + __syncthreads(); + + // ========== Compute partial results ========== + // Each thread computes its TM x TN output elements + #pragma unroll + for (int k = 0; k < BK; ++k) { + // Load A values for this thread's rows into registers + float a_reg[TM]; + #pragma unroll + for (int m = 0; m < TM; ++m) { + a_reg[m] = As[thread_row + m][k]; + } + + // Load B values for 
this thread's columns into registers + float b_reg[TN]; + #pragma unroll + for (int n = 0; n < TN; ++n) { + b_reg[n] = Bs[k][thread_col + n]; + } + + // Outer product accumulation + #pragma unroll + for (int m = 0; m < TM; ++m) { + #pragma unroll + for (int n = 0; n < TN; ++n) { + c_reg[m][n] += a_reg[m] * b_reg[n]; + } + } + } + + __syncthreads(); + } + + // ========== Write results to global memory ========== + #pragma unroll + for (int m = 0; m < TM; ++m) { + #pragma unroll + for (int n = 0; n < TN; ++n) { + int global_row = c_row_start + thread_row + m; + int global_col = c_col_start + thread_col + n; + + if (global_row < M && global_col < N) { + C[global_row * N + global_col] = c_reg[m][n]; + } + } + } +} + +// ============================================================================ +// Batched GEMV Kernel for Decode Phase +// ============================================================================ +// For decode phase: each sequence has 1 token, but we have multiple sequences +// Input: [batch_size, in_dim], Weight: [out_dim, in_dim], Output: [batch_size, out_dim] + +template +__global__ void gemv_batched_kernel( + const float* __restrict__ input, // [batch_size, in_dim] + const float* __restrict__ weight, // [out_dim, in_dim] + float* __restrict__ output, // [batch_size, out_dim] + const int batch_size, + const int in_dim, + const int out_dim) { + + // Grid: (out_dim, batch_size) - each block computes one output element for one sequence + const int out_idx = blockIdx.x; + const int batch_idx = blockIdx.y; + const int tid = threadIdx.x; + + if (batch_idx >= batch_size || out_idx >= out_dim) return; + + // Pointers to this sequence's input and weight row + const float* x = input + batch_idx * in_dim; + const float* w = weight + out_idx * in_dim; + + // Shared memory for partial sums + __shared__ float sdata[THREADS]; + float sum = 0.0f; + + // Vectorized load using float4 for better memory bandwidth + const int vec_size = 4; + const int vec_num = in_dim / vec_size; + const float4* x_vec = reinterpret_cast(x); + const float4* w_vec = reinterpret_cast(w); + + // Process vectorized portion + for (int i = tid; i < vec_num; i += THREADS) { + float4 xv = x_vec[i]; + float4 wv = w_vec[i]; + sum += xv.x * wv.x + xv.y * wv.y + xv.z * wv.z + xv.w * wv.w; + } + + // Process remaining elements + const int vec_offset = vec_num * vec_size; + for (int i = vec_offset + tid; i < in_dim; i += THREADS) { + sum += x[i] * w[i]; + } + + sdata[tid] = sum; + __syncthreads(); + + // Block reduction + for (int s = THREADS / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata[tid] += sdata[tid + s]; + } + __syncthreads(); + } + + // Write result + if (tid == 0) { + output[batch_idx * out_dim + out_idx] = sdata[0]; + } +} + +// ============================================================================ +// Public Interface Functions +// ============================================================================ + +void gemm_kernel_cu(const tensor::Tensor& input, const tensor::Tensor& weight, + const tensor::Tensor& output, const CudaConfig* config) { + CHECK(!input.is_empty()) << "Input tensor is empty"; + CHECK(!weight.is_empty()) << "Weight tensor is empty"; + CHECK(input.device_type() == base::DeviceType::kDeviceCUDA) << "Input must be on CUDA"; + CHECK(weight.device_type() == base::DeviceType::kDeviceCUDA) << "Weight must be on CUDA"; + + // Determine dimensions + // Input can be [num_tokens, in_dim] or [in_dim] for single token + int M, K, N; + if (input.dims_size() == 1) { + // Single token case: 
[in_dim] + M = 1; + K = input.get_dim(0); + } else { + // Multi-token case: [num_tokens, in_dim] + M = input.get_dim(0); + K = input.get_dim(1); + } + + // Weight is [out_dim, in_dim] + N = weight.get_dim(0); + CHECK_EQ(weight.get_dim(1), K) << "Weight dimension mismatch"; + + // For single token with 1D input, fall back to existing optimized GEMV + // Note: only fallback when input is 1D [dim], not 2D [1, dim] + if (M == 1 && input.dims_size() == 1) { + matmul_kernel_cu(input, weight, output, 1.0f, config); + return; + } + + // Multi-token: use tiled GEMM + dim3 block(THREADS_N, THREADS_M); // (16, 16) = 256 threads + dim3 grid((N + BLOCK_N - 1) / BLOCK_N, (M + BLOCK_M - 1) / BLOCK_M); + + cudaStream_t stream = config ? config->stream : nullptr; + + gemm_tiled_kernel + <<>>( + input.ptr(), + weight.ptr(), + const_cast(output.ptr()), + M, K, N); +} + +void gemv_batched_kernel_cu(const tensor::Tensor& input, const tensor::Tensor& weight, + const tensor::Tensor& output, const CudaConfig* config) { + CHECK(!input.is_empty()) << "Input tensor is empty"; + CHECK(!weight.is_empty()) << "Weight tensor is empty"; + CHECK(input.device_type() == base::DeviceType::kDeviceCUDA) << "Input must be on CUDA"; + CHECK(weight.device_type() == base::DeviceType::kDeviceCUDA) << "Weight must be on CUDA"; + CHECK_EQ(input.dims_size(), 2) << "Input must be 2D [batch_size, in_dim]"; + + const int batch_size = input.get_dim(0); + const int in_dim = input.get_dim(1); + const int out_dim = weight.get_dim(0); + + CHECK_EQ(weight.get_dim(1), in_dim) << "Weight dimension mismatch"; + + // For small batch sizes, use custom batched GEMV kernel + // For larger batches, GEMM might be more efficient + constexpr int BATCH_THRESHOLD = 16; + + if (batch_size <= BATCH_THRESHOLD) { + constexpr int THREADS = 256; + dim3 grid(out_dim, batch_size); + dim3 block(THREADS); + + cudaStream_t stream = config ? config->stream : nullptr; + + gemv_batched_kernel<<>>( + input.ptr(), + weight.ptr(), + const_cast(output.ptr()), + batch_size, in_dim, out_dim); + } else { + // Fall back to GEMM for larger batches + gemm_kernel_cu(input, weight, output, config); + } +} + +} // namespace kernel diff --git a/kuiper/source/op/kernels/cuda/gemm_kernel.cuh b/kuiper/source/op/kernels/cuda/gemm_kernel.cuh new file mode 100644 index 0000000..44c0a93 --- /dev/null +++ b/kuiper/source/op/kernels/cuda/gemm_kernel.cuh @@ -0,0 +1,42 @@ +#ifndef GEMM_KERNEL_CU_CUH +#define GEMM_KERNEL_CU_CUH +#include "../kernels_interface.h" +#include "tensor/tensor.h" + +namespace kernel { + +/** + * Tiled GEMM kernel for multi-token matrix multiplication + * + * Computes: C[M, N] = A[M, K] × B[K, N]^T + * Where B is stored as [N, K] (row-major, transposed) + * + * For LLM inference: + * A = Input [num_tokens, in_dim] + * B = Weight [out_dim, in_dim] (stored transposed) + * C = Output [num_tokens, out_dim] + * + * @param input Input tensor [num_tokens, in_dim] or [in_dim] for single token + * @param weight Weight tensor [out_dim, in_dim] + * @param output Output tensor [num_tokens, out_dim] or [out_dim] for single token + * @param config CUDA configuration (stream, etc.) 
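 *
 * Usage sketch (minimal, assuming fp32 CUDA tensors already allocated with the shapes
 * documented above; `cuda_config` stands for whatever CudaConfig the caller owns):
 *
 *   // input: [num_tokens, in_dim], weight: [out_dim, in_dim], output: [num_tokens, out_dim]
 *   kernel::gemm_kernel_cu(input, weight, output, cuda_config);
 *
 *   // A 1-D single-token input falls back to the existing matmul_kernel_cu GEMV path
 *   // (see gemm_kernel.cu); multi-token inputs go through the tiled GEMM.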
+ */ +void gemm_kernel_cu(const tensor::Tensor& input, const tensor::Tensor& weight, + const tensor::Tensor& output, const CudaConfig* config = nullptr); + +/** + * Batched GEMV kernel for multi-sequence decode phase + * Each sequence has exactly 1 token + * + * @param input Input tensor [batch_size, in_dim] + * @param weight Weight tensor [out_dim, in_dim] + * @param output Output tensor [batch_size, out_dim] + * @param config CUDA configuration + */ +void gemv_batched_kernel_cu(const tensor::Tensor& input, const tensor::Tensor& weight, + const tensor::Tensor& output, const CudaConfig* config = nullptr); + + +} // namespace kernel + +#endif // GEMM_KERNEL_CU_CUH diff --git a/kuiper/source/op/kernels/cuda/paged_attention_kernel.cu b/kuiper/source/op/kernels/cuda/paged_attention_kernel.cu new file mode 100644 index 0000000..d51941c --- /dev/null +++ b/kuiper/source/op/kernels/cuda/paged_attention_kernel.cu @@ -0,0 +1,628 @@ +/** + * @author jintang-coder + * - PagedAttention: GPU kernels for paged KV cache attention + * + * ============================================================================= + * PagedAttention CUDA Kernels + * ============================================================================= + * + * 功能: 在分页KV Cache上执行注意力计算,支持Decode/Prefill/Mixed批次 + * + * KV Cache 内存布局: + * kv_cache[Total_Blocks, Num_Layers, 2(K/V), Block_Size, KV_Dim] + * + * 访问某个token的Key: + * physical_block = block_tables[seq_idx][logical_block] + * offset = physical_block * block_stride + * + layer_idx * layer_stride + * + 0 * kv_plane_stride // K plane + * + token_in_block * kv_dim + * + * --- + * Kernel 1: paged_attention_kernel (Decode专用) + * + * Grid: (num_seqs, num_heads) + * Block: 128 threads + * + * 每个block处理一个(seq, head)对: + * 1. 加载query[seq][head] + * 2. 遍历该seq的所有KV blocks + * 3. 计算attention scores (online softmax) + * 4. 输出加权value + * + * 例子: seq0有50个tokens,block_size=16 + * num_logical_blocks = ceil(50/16) = 4 + * 遍历 block_tables[0][0..3] 获取物理块ID + * 最后一个块只有 50%16=2 个有效token + * + * --- + * Kernel 2: batch_kv_cache_write_kernel (Decode写入) + * + * Grid: (num_seqs) + * Block: 256 threads + * + * 每个block写入一个序列的新KV: + * seq_idx = blockIdx.x + * token_pos = positions[seq_idx] + * logical_block = token_pos / block_size + * physical_block = block_tables[seq_idx][logical_block] + * 写入 kv_cache[physical_block][layer][K/V][token_in_block] + * + * --- + * Kernel 3: batch_kv_cache_write_kernel_v2 (Prefill/Mixed写入) + * + * Grid: (num_tokens) // 注意: 按token而非seq + * Block: 256 threads + * + * 关键区别: 使用 seq_ids[] 映射token到序列 + * + * 例子: Mixed batch + * all_tokens = [t0..t9, t10, t11..t15] // 16 tokens + * seq_ids = [0,0,0,0,0,0,0,0,0,0, 1, 2,2,2,2,2] + * positions = [0,1,2,3,4,5,6,7,8,9, 25, 0,1,2,3,4] + * + * token_idx=10 (seq1的decode token): + * seq_idx = seq_ids[10] = 1 + * token_pos = positions[10] = 25 + * logical_block = 25/16 = 1 + * physical_block = block_tables[1][1] + * token_in_block = 25%16 = 9 + * + * --- + * Kernel 4: paged_attention_prefill_kernel (Prefill/Mixed注意力) + * + * Grid: (num_tokens, num_heads) // 每个query token一个block + * Block: 128 threads + * + * 关键: Causal Mask + * 每个query只attend到 position <= 自己的位置 + * + * 例子: Prefill seq0, positions=[0,1,2,3,4] + * query at pos=0: attend to [0] + * query at pos=1: attend to [0,1] + * query at pos=2: attend to [0,1,2] + * ... 
+ *
+ *   Implementation:
+ *     effective_context_len = min(context_lens[seq_idx], query_pos + 1)
+ *
+ * ---
+ * Online softmax (FlashAttention style):
+ *
+ *   Avoid materializing the full attention matrix; update token by token:
+ *
+ *   m_prev = -INF, d_prev = 0, acc_out = 0
+ *   for each key_token:
+ *     score      = dot(q, k) * scale
+ *     m_new      = max(m_prev, score)
+ *     exp_score  = exp(score - m_new)
+ *     correction = exp(m_prev - m_new)
+ *     d_prev     = d_prev * correction + exp_score
+ *     acc_out    = acc_out * correction + v * exp_score
+ *     m_prev     = m_new
+ *   output = acc_out / d_prev
+ *
+ * ---
+ * Data flow overview:
+ *
+ *   BatchMetadata                    GPU kernels
+ *   ┌─────────────────┐       ┌───────────────────────────────┐
+ *   │ seq_ids[16]     │──────►│ batch_kv_cache_write_v2       │
+ *   │ positions[16]   │       │ write K/V to the right block  │
+ *   └─────────────────┘       └───────────────────────────────┘
+ *                                             │
+ *   KVCacheManager                            ▼
+ *   ┌─────────────────┐       ┌───────────────────────────────┐
+ *   │ block_tables    │──────►│ paged_attention_prefill       │
+ *   │ context_lens    │       │ read K/V, compute attention   │
+ *   │ kv_cache        │       └───────────────────────────────┘
+ *   └─────────────────┘                       │
+ *                                             ▼
+ *                             ┌───────────────────────────────┐
+ *                             │ Output: [num_tokens, dim]     │
+ *                             └───────────────────────────────┘
+ *
+ */
+
+#include <cub/cub.cuh>
+#include <cuda_runtime.h>
+#include <cfloat>
+#include <glog/logging.h>
+#include "paged_attention_kernel.cuh"
+#include <cmath>
+
+namespace kernel {
+// Kernel implementations (v1, simplified)
+
+__global__ void paged_attention_kernel_kernel(
+    float* __restrict__ out,                    // [num_seqs, num_heads, head_dim]
+    const float* __restrict__ query,            // [num_seqs, num_heads, head_dim]
+    const float* __restrict__ key_value_cache,  // [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] (KV_Dim = num_kv_heads * head_dim)
+    const int* __restrict__ block_tables,
+    const int* __restrict__ context_lens,
+    const int max_context_len,
+    const int num_seqs,
+    const int num_heads,
+    const int num_kv_heads,
+    const int head_dim,
+    const int block_size,
+    const int max_blocks_per_seq,
+    const float scale,
+    const int layer_idx,
+    const int num_layers
+) {
+  int seq_idx = blockIdx.x;
+  int head_idx = blockIdx.y;
+  int tid = threadIdx.x;  // indexes one element of head_dim
+  if (seq_idx >= num_seqs || head_idx >= num_heads) return;
+
+  const long long token_stride = (long long)num_kv_heads * head_dim;
+  const long long kv_plane_stride = (long long)block_size * token_stride;
+  const long long layer_stride = 2LL * kv_plane_stride;
+  const long long block_stride = (long long)num_layers * layer_stride;
+  const long long kv_head_idx = head_idx / (num_heads / num_kv_heads);
+  const long long layer_offset = layer_stride * layer_idx;
+
+  float q_val = 0.0f;
+  if (tid < head_dim) {
+    q_val = query[(long long)seq_idx * num_heads * head_dim + (long long)head_idx * head_dim + tid];
+  }
+
+  using BlockReduce = cub::BlockReduce<float, 128>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  __shared__ float shared_score;
+  float m_prev = -FLT_MAX;
+  float d_prev = 0.0f;
+  float acc_out = 0.0f;
+  int context_len = context_lens[seq_idx];
+  int num_logical_blocks = (context_len - 1) / block_size + 1;
+  for (int i = 0; i < num_logical_blocks; i++) {
+    int physical_block_id = block_tables[seq_idx * max_blocks_per_seq + i];
+    int valid_tokens = block_size;
+    if (i == num_logical_blocks - 1) {
+      valid_tokens = context_len % block_size;
+      if (!valid_tokens) valid_tokens = block_size;
+    }
+
+    for (int t = 0; t < valid_tokens; t++) {
+      const long long offset =
+          (long long)physical_block_id * block_stride +
+          layer_offset +
+          (long long)t * token_stride +
+          kv_head_idx * head_dim +
+          tid;
+
+      float dot = 0.0f;
+      if (tid < head_dim) {
+        dot = q_val * key_value_cache[offset];
+      }
+
+      float dot_sum = BlockReduce(temp_storage).Sum(dot);
+      __syncthreads();
+      if (tid == 0) {
+        shared_score = dot_sum * scale;
+      }
+      __syncthreads();
+
+      float score = shared_score;
+      float m_new = max(m_prev, score);
+      float exp_score = expf(score - m_new);
+      float correction = expf(m_prev - m_new);
+      d_prev = d_prev * correction + exp_score;
+
+      float v_val = 0.0f;
+      if (tid < head_dim) {
+        v_val = key_value_cache[offset + kv_plane_stride];
+      }
+      acc_out = acc_out * correction + v_val * exp_score;
+      m_prev = m_new;
+    }
+  }
+
+  if (tid < head_dim) {
+    out[(long long)seq_idx * num_heads * head_dim + (long long)head_idx * head_dim + tid] =
+        (d_prev > 0.0f) ? (acc_out / d_prev) : 0.0f;
+  }
+}
+
+void paged_attention_kernel_cu(
+    tensor::Tensor& out,
+    const tensor::Tensor& query,
+    const tensor::Tensor& key_value_cache,
+    const tensor::Tensor& block_tables,
+    const tensor::Tensor& context_lens,
+    int max_context_len,
+    int num_seqs,
+    int num_heads,
+    int num_kv_heads,
+    int head_dim,
+    int block_size,
+    int max_blocks_per_seq,
+    float scale,
+    int layer_idx,
+    int num_layers,
+    cudaStream_t stream
+) {
+  CHECK_EQ(out.is_empty(), false);
+  CHECK_EQ(query.is_empty(), false);
+  CHECK_EQ(key_value_cache.is_empty(), false);
+  CHECK_EQ(block_tables.is_empty(), false);
+  CHECK_EQ(context_lens.is_empty(), false);
+
+  dim3 grid(num_seqs, num_heads);
+  dim3 block(128);
+
+  paged_attention_kernel_kernel<<<grid, block, 0, stream>>>(
+      out.ptr<float>(), query.ptr<float>(), key_value_cache.ptr<float>(),
+      block_tables.ptr<int32_t>(), context_lens.ptr<int32_t>(), max_context_len, num_seqs, num_heads,
+      num_kv_heads, head_dim, block_size, max_blocks_per_seq, scale, layer_idx, num_layers);
+}
+
+// Batch KV cache write kernel
+// Each thread block handles one sequence, threads handle kv_dim elements
+__global__ void batch_kv_cache_write_kernel(
+    float* __restrict__ key_value_cache,  // [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim]
+    const float* __restrict__ key,        // [num_seqs, kv_dim]
const float* __restrict__ value, // [num_seqs, kv_dim] + const int* __restrict__ block_tables, // [num_seqs, max_blocks_per_seq] + const int* __restrict__ positions, // [num_seqs] + const int num_seqs, + const int kv_dim, + const int block_size, + const int max_blocks_per_seq, + const int layer_idx, + const int num_layers +) { + int seq_idx = blockIdx.x; + int tid = threadIdx.x; + + if (seq_idx >= num_seqs) return; + + // Get token position for this sequence + int token_pos = positions[seq_idx]; + + // Calculate which block and position within block + int logical_block = token_pos / block_size; + int token_in_block = token_pos % block_size; + + // Get physical block ID from block table + int physical_block_id = block_tables[seq_idx * max_blocks_per_seq + logical_block]; + + // Calculate strides for KV cache layout: [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] + const long long token_stride = (long long)kv_dim; + const long long kv_plane_stride = (long long)block_size * token_stride; + const long long layer_stride = 2LL * kv_plane_stride; + const long long block_stride = (long long)num_layers * layer_stride; + + // Base offset for this block, layer, and token position + const long long base_offset = (long long)physical_block_id * block_stride + + (long long)layer_idx * layer_stride + + (long long)token_in_block * token_stride; + + // Key offset (kv_plane index 0) + const long long key_offset = base_offset; + // Value offset (kv_plane index 1) + const long long value_offset = base_offset + kv_plane_stride; + + // Input offset for this sequence + const long long input_offset = (long long)seq_idx * kv_dim; + + // Copy key and value elements (each thread handles multiple elements if kv_dim > blockDim.x) + for (int i = tid; i < kv_dim; i += blockDim.x) { + key_value_cache[key_offset + i] = key[input_offset + i]; + key_value_cache[value_offset + i] = value[input_offset + i]; + } +} + +void batch_kv_cache_write_cu( + tensor::Tensor& key_value_cache, + const tensor::Tensor& key, + const tensor::Tensor& value, + const tensor::Tensor& block_tables, + const tensor::Tensor& positions, + int num_seqs, + int num_kv_heads, + int head_dim, + int block_size, + int max_blocks_per_seq, + int layer_idx, + int num_layers, + cudaStream_t stream +) { + CHECK_EQ(key_value_cache.is_empty(), false); + CHECK_EQ(key.is_empty(), false); + CHECK_EQ(value.is_empty(), false); + CHECK_EQ(block_tables.is_empty(), false); + CHECK_EQ(positions.is_empty(), false); + + int kv_dim = num_kv_heads * head_dim; + + // Launch one block per sequence, 256 threads per block + dim3 grid(num_seqs); + dim3 block(256); + + batch_kv_cache_write_kernel<<>>( + key_value_cache.ptr(), + key.ptr(), + value.ptr(), + block_tables.ptr(), + positions.ptr(), + num_seqs, + kv_dim, + block_size, + max_blocks_per_seq, + layer_idx, + num_layers + ); +} + +// ============================================================================ +// Batch KV cache write kernel v2 (unified version) +// Supports chunked prefill and continuous batching +// Uses seq_ids to map tokens to sequences +// ============================================================================ +__global__ void batch_kv_cache_write_kernel_v2( + float* __restrict__ key_value_cache, // [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] + const float* __restrict__ key, // [num_tokens, kv_dim] + const float* __restrict__ value, // [num_tokens, kv_dim] + const int* __restrict__ block_tables, // [num_seqs, max_blocks_per_seq] + const int* __restrict__ positions, // [num_tokens] + const int* 
__restrict__ seq_ids, // [num_tokens] - sequence index for each token + const int num_tokens, + const int kv_dim, + const int block_size, + const int max_blocks_per_seq, + const int layer_idx, + const int num_layers +) { + int token_idx = blockIdx.x; + int tid = threadIdx.x; + + if (token_idx >= num_tokens) return; + + // Get sequence index from seq_ids (key difference from v1) + int seq_idx = seq_ids[token_idx]; + int token_pos = positions[token_idx]; + + // Calculate which block and position within block + int logical_block = token_pos / block_size; + int token_in_block = token_pos % block_size; + + // Get physical block ID from block table + int physical_block_id = block_tables[seq_idx * max_blocks_per_seq + logical_block]; + + // Calculate strides for KV cache layout: [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] + const long long token_stride = (long long)kv_dim; + const long long kv_plane_stride = (long long)block_size * token_stride; + const long long layer_stride = 2LL * kv_plane_stride; + const long long block_stride = (long long)num_layers * layer_stride; + + // Base offset for this block, layer, and token position + const long long base_offset = (long long)physical_block_id * block_stride + + (long long)layer_idx * layer_stride + + (long long)token_in_block * token_stride; + + // Key offset (kv_plane index 0) + const long long key_offset = base_offset; + // Value offset (kv_plane index 1) + const long long value_offset = base_offset + kv_plane_stride; + + // Input offset for this token + const long long input_offset = (long long)token_idx * kv_dim; + + // Copy key and value elements + for (int i = tid; i < kv_dim; i += blockDim.x) { + key_value_cache[key_offset + i] = key[input_offset + i]; + key_value_cache[value_offset + i] = value[input_offset + i]; + } +} + +void batch_kv_cache_write_v2_cu( + tensor::Tensor& key_value_cache, + const tensor::Tensor& key, + const tensor::Tensor& value, + const tensor::Tensor& block_tables, + const tensor::Tensor& positions, + const tensor::Tensor& seq_ids, + int num_tokens, + int num_kv_heads, + int head_dim, + int block_size, + int max_blocks_per_seq, + int layer_idx, + int num_layers, + cudaStream_t stream +) { + CHECK_EQ(key_value_cache.is_empty(), false); + CHECK_EQ(key.is_empty(), false); + CHECK_EQ(value.is_empty(), false); + CHECK_EQ(block_tables.is_empty(), false); + CHECK_EQ(positions.is_empty(), false); + CHECK_EQ(seq_ids.is_empty(), false); + + int kv_dim = num_kv_heads * head_dim; + + // Launch one block per token, 256 threads per block + dim3 grid(num_tokens); + dim3 block(256); + + batch_kv_cache_write_kernel_v2<<>>( + key_value_cache.ptr(), + key.ptr(), + value.ptr(), + block_tables.ptr(), + positions.ptr(), + seq_ids.ptr(), + num_tokens, + kv_dim, + block_size, + max_blocks_per_seq, + layer_idx, + num_layers + ); +} + +// ============================================================================ +// Paged attention prefill kernel +// Supports multiple query tokens per sequence with causal mask +// Each query only attends to positions <= its own position +// ============================================================================ +__global__ void paged_attention_prefill_kernel_impl( + float* __restrict__ out, // [num_tokens, num_heads, head_dim] + const float* __restrict__ query, // [num_tokens, num_heads, head_dim] + const float* __restrict__ key_value_cache, // [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] + const int* __restrict__ block_tables, // [num_seqs, max_blocks_per_seq] + const int* __restrict__ 
context_lens, // [num_seqs] + const int* __restrict__ seq_ids, // [num_tokens] + const int* __restrict__ positions, // [num_tokens] - for causal mask + const int num_tokens, + const int num_seqs, + const int num_heads, + const int num_kv_heads, + const int head_dim, + const int block_size, + const int max_blocks_per_seq, + const float scale, + const int layer_idx, + const int num_layers +) { + // Grid: (num_tokens, num_heads) + int token_idx = blockIdx.x; + int head_idx = blockIdx.y; + int tid = threadIdx.x; + + if (token_idx >= num_tokens || head_idx >= num_heads) return; + + int seq_idx = seq_ids[token_idx]; + int query_pos = positions[token_idx]; // Current query's position + + // Causal mask: only attend to positions <= query_pos + // Also limited by context_lens[seq_idx] + int context_len = context_lens[seq_idx]; + int effective_context_len = min(context_len, query_pos + 1); + + if (effective_context_len <= 0) { + // No keys to attend to, output zeros + if (tid < head_dim) { + out[(long long)token_idx * num_heads * head_dim + (long long)head_idx * head_dim + tid] = 0.0f; + } + return; + } + + // Calculate strides + const long long token_stride = (long long)num_kv_heads * head_dim; + const long long kv_plane_stride = (long long)block_size * token_stride; + const long long layer_stride = 2LL * kv_plane_stride; + const long long block_stride = (long long)num_layers * layer_stride; + const long long kv_head_idx = head_idx / (num_heads / num_kv_heads); + const long long layer_offset = layer_stride * layer_idx; + + // Load query value + float q_val = 0.0f; + if (tid < head_dim) { + q_val = query[(long long)token_idx * num_heads * head_dim + (long long)head_idx * head_dim + tid]; + } + + // FlashAttention-style online softmax + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ float shared_score; + + float m_prev = -FLT_MAX; + float d_prev = 0.0f; + float acc_out = 0.0f; + + int num_logical_blocks = (effective_context_len - 1) / block_size + 1; + + for (int i = 0; i < num_logical_blocks; i++) { + int physical_block_id = block_tables[seq_idx * max_blocks_per_seq + i]; + int valid_tokens = block_size; + if (i == num_logical_blocks - 1) { + valid_tokens = effective_context_len % block_size; + if (!valid_tokens) valid_tokens = block_size; + } + + for (int t = 0; t < valid_tokens; t++) { + const long long offset = + (long long)physical_block_id * block_stride + + layer_offset + + (long long)t * token_stride + + kv_head_idx * head_dim + + tid; + + float dot = 0.0f; + if (tid < head_dim) { + float k_val = key_value_cache[offset]; + dot = q_val * k_val; + } + + float dot_sum = BlockReduce(temp_storage).Sum(dot); + __syncthreads(); + if (tid == 0) { + shared_score = dot_sum * scale; + } + __syncthreads(); + + float score = shared_score; + float m_new = max(m_prev, score); + float exp_score = expf(score - m_new); + float correction = expf(m_prev - m_new); + d_prev = d_prev * correction + exp_score; + + float v_val = 0.0f; + if (tid < head_dim) { + v_val = key_value_cache[offset + kv_plane_stride]; + } + acc_out = acc_out * correction + v_val * exp_score; + m_prev = m_new; + } + } + + if (tid < head_dim) { + float final_out = (d_prev > 0.0f) ? 
(acc_out / d_prev) : 0.0f; + out[(long long)token_idx * num_heads * head_dim + (long long)head_idx * head_dim + tid] = final_out; + } +} + +void paged_attention_prefill_cu( + tensor::Tensor& out, + const tensor::Tensor& query, + const tensor::Tensor& key_value_cache, + const tensor::Tensor& block_tables, + const tensor::Tensor& context_lens, + const tensor::Tensor& seq_ids, + const tensor::Tensor& positions, + int num_tokens, + int num_seqs, + int num_heads, + int num_kv_heads, + int head_dim, + int block_size, + int max_blocks_per_seq, + float scale, + int layer_idx, + int num_layers, + cudaStream_t stream +) { + CHECK_EQ(out.is_empty(), false); + CHECK_EQ(query.is_empty(), false); + CHECK_EQ(key_value_cache.is_empty(), false); + CHECK_EQ(block_tables.is_empty(), false); + CHECK_EQ(context_lens.is_empty(), false); + CHECK_EQ(seq_ids.is_empty(), false); + CHECK_EQ(positions.is_empty(), false); + + dim3 grid(num_tokens, num_heads); + dim3 block(128); + + paged_attention_prefill_kernel_impl<<>>( + out.ptr(), + query.ptr(), + key_value_cache.ptr(), + block_tables.ptr(), + context_lens.ptr(), + seq_ids.ptr(), + positions.ptr(), + num_tokens, + num_seqs, + num_heads, + num_kv_heads, + head_dim, + block_size, + max_blocks_per_seq, + scale, + layer_idx, + num_layers + ); +} + +} \ No newline at end of file diff --git a/kuiper/source/op/kernels/cuda/paged_attention_kernel.cuh b/kuiper/source/op/kernels/cuda/paged_attention_kernel.cuh new file mode 100644 index 0000000..71c29b1 --- /dev/null +++ b/kuiper/source/op/kernels/cuda/paged_attention_kernel.cuh @@ -0,0 +1,103 @@ + +#ifndef PAGED_ATTENTION_KERNEL_CU_H +#define PAGED_ATTENTION_KERNEL_CU_H + +#include "tensor/tensor.h" +#include + +namespace kernel { + +// 声明 CUDA 实现函数,参数需严格匹配 kernels_interface.h 中的 PagedAttentionKernel +void paged_attention_kernel_cu( + tensor::Tensor& out, + const tensor::Tensor& query, + const tensor::Tensor& key_value_cache, + const tensor::Tensor& block_tables, + const tensor::Tensor& context_lens, + int max_context_len, + int num_seqs, + int num_heads, + int num_kv_heads, + int head_dim, + int block_size, + int max_blocks_per_seq, + float scale, + int layer_idx , + int num_layers, + cudaStream_t stream +); + +// Batch KV cache write kernel (legacy version) +// Writes K and V for multiple sequences to paged KV cache +// Assumes token_idx == seq_idx (one token per sequence) +// key: [num_seqs, kv_dim] +// value: [num_seqs, kv_dim] +// positions: [num_seqs] - token position for each sequence +void batch_kv_cache_write_cu( + tensor::Tensor& key_value_cache, // [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] + const tensor::Tensor& key, // [num_seqs, kv_dim] + const tensor::Tensor& value, // [num_seqs, kv_dim] + const tensor::Tensor& block_tables, // [num_seqs, max_blocks_per_seq] + const tensor::Tensor& positions, // [num_seqs] - token positions + int num_seqs, + int num_kv_heads, + int head_dim, + int block_size, + int max_blocks_per_seq, + int layer_idx, + int num_layers, + cudaStream_t stream +); + +// Batch KV cache write kernel v2 (unified version) +// Supports chunked prefill and continuous batching +// Uses seq_ids to map tokens to sequences +// key: [num_tokens, kv_dim] +// value: [num_tokens, kv_dim] +// positions: [num_tokens] - position for each token +// seq_ids: [num_tokens] - sequence index for each token +void batch_kv_cache_write_v2_cu( + tensor::Tensor& key_value_cache, // [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] + const tensor::Tensor& key, // [num_tokens, kv_dim] + const tensor::Tensor& value, 
// [num_tokens, kv_dim] + const tensor::Tensor& block_tables, // [num_seqs, max_blocks_per_seq] + const tensor::Tensor& positions, // [num_tokens] - token positions + const tensor::Tensor& seq_ids, // [num_tokens] - sequence index for each token + int num_tokens, + int num_kv_heads, + int head_dim, + int block_size, + int max_blocks_per_seq, + int layer_idx, + int num_layers, + cudaStream_t stream +); + +// Paged attention prefill kernel +// Supports multiple query tokens per sequence with causal mask +// query: [num_tokens, num_heads, head_dim] +// Uses positions for causal masking (each query only attends to positions <= its own) +void paged_attention_prefill_cu( + tensor::Tensor& out, // [num_tokens, num_heads, head_dim] + const tensor::Tensor& query, // [num_tokens, num_heads, head_dim] + const tensor::Tensor& key_value_cache, // [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] + const tensor::Tensor& block_tables, // [num_seqs, max_blocks_per_seq] + const tensor::Tensor& context_lens, // [num_seqs] - context length for each sequence + const tensor::Tensor& seq_ids, // [num_tokens] - sequence index for each token + const tensor::Tensor& positions, // [num_tokens] - position for each token (for causal mask) + int num_tokens, + int num_seqs, + int num_heads, + int num_kv_heads, + int head_dim, + int block_size, + int max_blocks_per_seq, + float scale, + int layer_idx, + int num_layers, + cudaStream_t stream +); + +} // namespace kernel + +#endif // PAGED_ATTENTION_KERNEL_CU_H diff --git a/kuiper/source/op/kernels/cuda/rope_kernel.cu b/kuiper/source/op/kernels/cuda/rope_kernel.cu index d704d24..6e9822e 100644 --- a/kuiper/source/op/kernels/cuda/rope_kernel.cu +++ b/kuiper/source/op/kernels/cuda/rope_kernel.cu @@ -168,4 +168,132 @@ void rope_kernel_cu(int32_t dim, int32_t kv_dim, int32_t head_size, const tensor cos_cache.ptr()); } } + +// ============================================================================ +// Batched RoPE Kernel for multi-token processing +// ============================================================================ +// Grid: (num_tokens) +// Block: processes one token's Q and K vectors + +#if defined(LLAMA3_SUPPORT) || defined(QWEN2_SUPPORT) || defined(QWEN3_SUPPORT) +__global__ void rope_kernel_batch_cu_fp32( + int num_tokens, int dim, int kv_dim, int head_size, + float* input_q, // [num_tokens, dim] + float* input_k, // [num_tokens, kv_dim] + const int* positions, // [num_tokens] + const float* sin_cache, // [max_seq_len, head_size] + const float* cos_cache) { + + int token_idx = blockIdx.x; + int tid = threadIdx.x; + + if (token_idx >= num_tokens) return; + + int pos = positions[token_idx]; + + int num_heads = dim / head_size; + int head_pair_count = head_size / 2; + int total_pairs = num_heads * head_pair_count; + + // Each thread handles one (head, pair) combination + for (int idx = tid; idx < total_pairs; idx += blockDim.x) { + int head_idx = idx / head_pair_count; + int head_dim = idx % head_pair_count; + + int i = head_idx * head_size; + int v0_idx = i + head_dim; + int v1_idx = i + head_dim + head_size / 2; + + float fci = sin_cache[pos * head_size + head_dim * 2]; + float fcr = cos_cache[pos * head_size + head_dim * 2]; + + int rotn = i < kv_dim ? 2 : 1; + + for (int v = 0; v < rotn; v++) { + float* vec = (v == 0) ? 
(input_q + token_idx * dim) : (input_k + token_idx * kv_dim); + float v0 = vec[v0_idx]; + float v1 = vec[v1_idx]; + vec[v0_idx] = fcr * v0 - fci * v1; + vec[v1_idx] = fcr * v1 + fci * v0; + } + } +} +#else +// LLama2 version +__global__ void rope_kernel_batch_cu_fp32( + int num_tokens, int dim, int kv_dim, int head_size, + float* input_q, // [num_tokens, dim] + float* input_k, // [num_tokens, kv_dim] + const int* positions, // [num_tokens] + const float* sin_cache, // [max_seq_len, head_size] + const float* cos_cache) { + + int token_idx = blockIdx.x; + int tid = threadIdx.x; + + if (token_idx >= num_tokens) return; + + int pos = positions[token_idx]; + float* q_ptr = input_q + token_idx * dim; + float* k_ptr = input_k + token_idx * kv_dim; + + // Process Q vector + for (int idx = tid * 2; idx < dim; idx += blockDim.x * 2) { + int head_dim = idx % head_size; + float fci = sin_cache[pos * head_size + head_dim]; + float fcr = cos_cache[pos * head_size + head_dim]; + + float2* vec_ptr = reinterpret_cast(q_ptr + idx); + float2 vec_value = *vec_ptr; + *vec_ptr = make_float2(vec_value.x * fcr - vec_value.y * fci, + vec_value.x * fci + vec_value.y * fcr); + } + + // Process K vector + for (int idx = tid * 2; idx < kv_dim; idx += blockDim.x * 2) { + int head_dim = idx % head_size; + float fci = sin_cache[pos * head_size + head_dim]; + float fcr = cos_cache[pos * head_size + head_dim]; + + float2* vec_ptr = reinterpret_cast(k_ptr + idx); + float2 vec_value = *vec_ptr; + *vec_ptr = make_float2(vec_value.x * fcr - vec_value.y * fci, + vec_value.x * fci + vec_value.y * fcr); + } +} +#endif + +void rope_kernel_batch_cu(int32_t num_tokens, int32_t dim, int32_t kv_dim, int32_t head_size, + const tensor::Tensor& input_q, const tensor::Tensor& input_k, + const tensor::Tensor& positions, const tensor::Tensor& sin_cache, + const tensor::Tensor& cos_cache, void* stream) { + CHECK(!input_q.is_empty()); + CHECK(!input_k.is_empty()); + CHECK(!positions.is_empty()); + CHECK(!sin_cache.is_empty()); + CHECK(!cos_cache.is_empty()); + + int threads = 128; + int blocks = num_tokens; // One block per token + + if (stream) { + cudaStream_t stream_ = static_cast(stream); + rope_kernel_batch_cu_fp32<<>>( + num_tokens, dim, kv_dim, head_size, + const_cast(input_q.ptr()), + const_cast(input_k.ptr()), + positions.ptr(), + sin_cache.ptr(), + cos_cache.ptr()); + } else { + rope_kernel_batch_cu_fp32<<>>( + num_tokens, dim, kv_dim, head_size, + const_cast(input_q.ptr()), + const_cast(input_k.ptr()), + positions.ptr(), + sin_cache.ptr(), + cos_cache.ptr()); + } +} + } // namespace kernel \ No newline at end of file diff --git a/kuiper/source/op/kernels/cuda/rope_kernel.cuh b/kuiper/source/op/kernels/cuda/rope_kernel.cuh index 4189226..2d12db0 100644 --- a/kuiper/source/op/kernels/cuda/rope_kernel.cuh +++ b/kuiper/source/op/kernels/cuda/rope_kernel.cuh @@ -2,10 +2,21 @@ #define ROPE_KERNEL_CU_CUH #include "tensor/tensor.h" namespace kernel { + +// Original single-token RoPE void rope_kernel_cu(int32_t dim, int32_t kv_dim, int32_t head_size, const tensor::Tensor& input_q, const tensor::Tensor& input_k, const tensor::Tensor& input_pos, const tensor::Tensor& sin_cache, const tensor::Tensor& cos_cache, void* stream); +// Batched RoPE for multi-token processing (Prefill or multi-sequence Decode) +// input_q: [num_tokens, dim] +// input_k: [num_tokens, kv_dim] +// positions: [num_tokens] - position for each token +void rope_kernel_batch_cu(int32_t num_tokens, int32_t dim, int32_t kv_dim, int32_t head_size, + const tensor::Tensor& 
input_q, const tensor::Tensor& input_k, + const tensor::Tensor& positions, const tensor::Tensor& sin_cache, + const tensor::Tensor& cos_cache, void* stream); + void sin_cos_cache_calc_cu(int head_size, int max_seq_len, const tensor::Tensor& sin_cache, const tensor::Tensor& cos_cache, cudaStream_t stream); diff --git a/kuiper/source/op/kernels/kernels_interface.h b/kuiper/source/op/kernels/kernels_interface.h index 7c503c5..5dba02a 100644 --- a/kuiper/source/op/kernels/kernels_interface.h +++ b/kuiper/source/op/kernels/kernels_interface.h @@ -38,6 +38,12 @@ typedef void (*RoPEKernel)(int32_t dim, int32_t kv_dim, int32_t head_size, const tensor::Tensor& input_pos, const tensor::Tensor& sin_cache, const tensor::Tensor& cos_cache, void* stream); +// Batched RoPE kernel for multi-token processing +typedef void (*RoPEBatchKernel)(int32_t num_tokens, int32_t dim, int32_t kv_dim, int32_t head_size, + const tensor::Tensor& input_q, const tensor::Tensor& input_k, + const tensor::Tensor& positions, const tensor::Tensor& sin_cache, + const tensor::Tensor& cos_cache, void* stream); + typedef void (*ScaleKernel)(float scale, const tensor::Tensor& input, void* stream); typedef void (*SoftmaxInplaceKernel)(const tensor::Tensor& input, void* stream); @@ -46,6 +52,16 @@ typedef void (*ScaleSumKernel)(const tensor::Tensor& value, const tensor::Tensor const tensor::Tensor& output, int t, int size, int stride, void* stream); +// GEMM kernel for multi-token matrix multiplication (Prefill phase) +// Input: [num_tokens, in_dim], Weight: [out_dim, in_dim], Output: [num_tokens, out_dim] +typedef void (*GemmKernel)(const tensor::Tensor& input, const tensor::Tensor& weight, + const tensor::Tensor& output, const CudaConfig* config); + +// Batched GEMV kernel for multi-sequence decode phase +// Input: [batch_size, in_dim], Weight: [out_dim, in_dim], Output: [batch_size, out_dim] +typedef void (*GemvBatchedKernel)(const tensor::Tensor& input, const tensor::Tensor& weight, + const tensor::Tensor& output, const CudaConfig* config); + void softmax_inplace_cpu(const float* input_ptr, size_t size); AddKernel get_add_kernel(base::DeviceType device_type); @@ -62,6 +78,9 @@ RMSNormKernel get_rmsnorm_kernel(base::DeviceType device_type); RoPEKernel get_rope_kernel(base::DeviceType device_type); +// Get batched RoPE kernel for multi-token processing +RoPEBatchKernel get_rope_batch_kernel(base::DeviceType device_type); + ScaleKernel get_scale_kernel(base::DeviceType device_type); SoftmaxInplaceKernel get_softmax_kernel(base::DeviceType device_type); @@ -71,5 +90,12 @@ SwigluKernel get_swiglu_kernel(base::DeviceType device_type, void* stream = null ScaleSumKernel get_scale_sum_kernel(base::DeviceType device_type); RMSNormKernelDim get_rmsnorm_dim_kernel(base::DeviceType device_type); + +// Get GEMM kernel for multi-token matrix multiplication +GemmKernel get_gemm_kernel(base::DeviceType device_type); + +// Get batched GEMV kernel for multi-sequence decode +GemvBatchedKernel get_gemv_batched_kernel(base::DeviceType device_type); + } // namespace kernel #endif // KERNELS_INTERFACE_H diff --git a/kuiper/source/op/kernels/kernels_interfaces.cpp b/kuiper/source/op/kernels/kernels_interfaces.cpp index 6001dd3..9131fa0 100644 --- a/kuiper/source/op/kernels/kernels_interfaces.cpp +++ b/kuiper/source/op/kernels/kernels_interfaces.cpp @@ -11,6 +11,7 @@ #include "cpu/swiglu_kernel.h" #include "cuda/add_kernel.cuh" #include "cuda/emb_kernel.cuh" +#include "cuda/gemm_kernel.cuh" #include "cuda/matmul_kernel.cuh" #include "cuda/mha_kernel.cuh" 
#include "cuda/rmsnorm_kernel.cuh" @@ -82,6 +83,15 @@ RoPEKernel get_rope_kernel(base::DeviceType device_type) { } } +RoPEBatchKernel get_rope_batch_kernel(base::DeviceType device_type) { + if (device_type == base::DeviceType::kDeviceCUDA) { + return rope_kernel_batch_cu; + } else { + LOG(FATAL) << "Batched RoPE kernel is only supported on CUDA device."; + return nullptr; + } +} + ScaleKernel get_scale_kernel(base::DeviceType device_type) { if (device_type == base::DeviceType::kDeviceCPU) { return scale_inplace_cpu; @@ -140,4 +150,22 @@ ScaleSumKernel get_scale_sum_kernel(base::DeviceType device_type) { } } +GemmKernel get_gemm_kernel(base::DeviceType device_type) { + if (device_type == base::DeviceType::kDeviceCUDA) { + return gemm_kernel_cu; + } else { + LOG(FATAL) << "GEMM kernel is only supported on CUDA device."; + return nullptr; + } +} + +GemvBatchedKernel get_gemv_batched_kernel(base::DeviceType device_type) { + if (device_type == base::DeviceType::kDeviceCUDA) { + return gemv_batched_kernel_cu; + } else { + LOG(FATAL) << "Batched GEMV kernel is only supported on CUDA device."; + return nullptr; + } +} + } // namespace kernel diff --git a/kuiper/source/op/swiglu.cpp b/kuiper/source/op/swiglu.cpp index 977b49a..b290a3b 100644 --- a/kuiper/source/op/swiglu.cpp +++ b/kuiper/source/op/swiglu.cpp @@ -45,3 +45,11 @@ base::Status SwiGLULayer::forward() { } } // namespace op + + + + + + + + diff --git a/kuiper/source/tensor/tensor.cpp b/kuiper/source/tensor/tensor.cpp index 3c2336d..e36f39a 100644 --- a/kuiper/source/tensor/tensor.cpp +++ b/kuiper/source/tensor/tensor.cpp @@ -101,6 +101,30 @@ Tensor::Tensor(base::DataType data_type, std::vector dims, bool need_al } } +Tensor::Tensor(base::DataType data_type, size_t size, bool need_alloc, + std::shared_ptr alloc, void* ptr) + : data_type_(data_type), size_(size) { + // For large tensors, store size as a single dimension (may overflow int32_t, but size_ is correct) + // dims_ is used for shape info, size_ is used for actual allocation + if (size <= static_cast(INT32_MAX)) { + dims_.push_back(static_cast(size)); + } else { + // Store as two dimensions to avoid int32 overflow in dims_ + // This is just for bookkeeping; actual size is in size_ + dims_.push_back(INT32_MAX); + dims_.push_back(static_cast((size + INT32_MAX - 1) / INT32_MAX)); + } + if (need_alloc && alloc) { + allocate(alloc); + } else { + if (ptr != nullptr) { + CHECK(need_alloc == false) + << "The need_alloc is true when ptr parameter is not a null pointer."; + init_buffer(alloc, data_type_, need_alloc, ptr); + } + } +} + void Tensor::to_cuda(cudaStream_t stream) { CHECK_NE(buffer_, nullptr); const base::DeviceType device_type = this->device_type(); diff --git a/test/test_op/test_cu_gemm.cpp b/test/test_op/test_cu_gemm.cpp new file mode 100644 index 0000000..8aa6869 --- /dev/null +++ b/test/test_op/test_cu_gemm.cpp @@ -0,0 +1,345 @@ +#include +#include +#include +#include +#include +#include "../source/op/kernels/kernels_interface.h" +#include "base/buffer.h" + +using namespace kernel; + +// CPU reference implementation for GEMM +// C[M,N] = A[M,K] x B[N,K]^T (B stored as [N,K]) +void gemm_cpu_reference(const float* A, const float* B, float* C, + int M, int K, int N) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float sum = 0.0f; + for (int k = 0; k < K; ++k) { + // A[m,k] * B[n,k] + sum += A[m * K + k] * B[n * K + k]; + } + C[m * N + n] = sum; + } + } +} + +// Test basic GEMM with small matrices +TEST(test_gemm_cu, gemm_basic_small) { + auto alloc_cu = 
base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + const int M = 4; // num_tokens + const int K = 8; // in_dim + const int N = 6; // out_dim + + // Create input tensors on CPU + tensor::Tensor input(base::DataType::kDataTypeFp32, M, K, true, alloc_cpu); + tensor::Tensor weight(base::DataType::kDataTypeFp32, N, K, true, alloc_cpu); + tensor::Tensor output_cpu(base::DataType::kDataTypeFp32, M, N, true, alloc_cpu); + + // Initialize with simple values + for (int i = 0; i < M * K; ++i) { + input.index(i) = static_cast(i % 7); + } + for (int i = 0; i < N * K; ++i) { + weight.index(i) = static_cast((i % 5) - 2); + } + + // CPU reference + gemm_cpu_reference(input.ptr(), weight.ptr(), + output_cpu.ptr(), M, K, N); + + // Copy to GPU + input.to_cuda(nullptr); + weight.to_cuda(nullptr); + + tensor::Tensor output_cu(base::DataType::kDataTypeFp32, M, N, true, alloc_cu); + + // Run GEMM kernel + // Note: CudaConfig destructor will destroy the stream + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + kernel::get_gemm_kernel(base::DeviceType::kDeviceCUDA)(input, weight, output_cu, config); + cudaStreamSynchronize(config->stream); + + // Copy back and compare + output_cu.to_cpu(); + + for (int i = 0; i < M * N; ++i) { + ASSERT_NEAR(output_cu.index(i), output_cpu.index(i), 1e-4f) + << "Mismatch at index " << i; + } + + delete config; +} + +// Test GEMM with typical LLM dimensions +TEST(test_gemm_cu, gemm_llm_dimensions) { + auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + const int M = 32; // num_tokens (small batch for testing) + const int K = 256; // in_dim (reduced for testing) + const int N = 512; // out_dim + + tensor::Tensor input(base::DataType::kDataTypeFp32, M, K, true, alloc_cpu); + tensor::Tensor weight(base::DataType::kDataTypeFp32, N, K, true, alloc_cpu); + tensor::Tensor output_cpu(base::DataType::kDataTypeFp32, M, N, true, alloc_cpu); + + // Random initialization + std::mt19937 gen(42); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + for (int i = 0; i < M * K; ++i) { + input.index(i) = dist(gen); + } + for (int i = 0; i < N * K; ++i) { + weight.index(i) = dist(gen); + } + + // CPU reference + gemm_cpu_reference(input.ptr(), weight.ptr(), + output_cpu.ptr(), M, K, N); + + // GPU computation + input.to_cuda(nullptr); + weight.to_cuda(nullptr); + + tensor::Tensor output_cu(base::DataType::kDataTypeFp32, M, N, true, alloc_cu); + + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + kernel::get_gemm_kernel(base::DeviceType::kDeviceCUDA)(input, weight, output_cu, config); + cudaStreamSynchronize(config->stream); + + output_cu.to_cpu(); + + // Compare with tolerance for floating point + float max_diff = 0.0f; + for (int i = 0; i < M * N; ++i) { + float diff = std::abs(output_cu.index(i) - output_cpu.index(i)); + max_diff = std::max(max_diff, diff); + ASSERT_NEAR(output_cu.index(i), output_cpu.index(i), 1e-3f) + << "Mismatch at index " << i << ", diff = " << diff; + } + LOG(INFO) << "Max difference: " << max_diff; + + delete config; +} + +// Test GEMM with single token (should fall back to GEMV) +TEST(test_gemm_cu, gemm_single_token_fallback) { + auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + const int K = 128; // in_dim + const int N = 64; // out_dim + + // Single token: 1D input + 
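+  // (A 1D input exercises the `M == 1 && input.dims_size() == 1` branch in
+  //  gemm_kernel_cu, which dispatches to the existing matmul_kernel_cu GEMV path.)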
tensor::Tensor input(base::DataType::kDataTypeFp32, K, true, alloc_cpu); + tensor::Tensor weight(base::DataType::kDataTypeFp32, N, K, true, alloc_cpu); + tensor::Tensor output_cpu(base::DataType::kDataTypeFp32, N, true, alloc_cpu); + + std::mt19937 gen(123); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + for (int i = 0; i < K; ++i) { + input.index(i) = dist(gen); + } + for (int i = 0; i < N * K; ++i) { + weight.index(i) = dist(gen); + } + + // CPU reference (M=1) + gemm_cpu_reference(input.ptr(), weight.ptr(), + output_cpu.ptr(), 1, K, N); + + input.to_cuda(nullptr); + weight.to_cuda(nullptr); + + tensor::Tensor output_cu(base::DataType::kDataTypeFp32, N, true, alloc_cu); + + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + kernel::get_gemm_kernel(base::DeviceType::kDeviceCUDA)(input, weight, output_cu, config); + cudaStreamSynchronize(config->stream); + + output_cu.to_cpu(); + + for (int i = 0; i < N; ++i) { + ASSERT_NEAR(output_cu.index(i), output_cpu.index(i), 1e-4f) + << "Mismatch at index " << i; + } + + delete config; +} + +// Test batched GEMV for decode phase +TEST(test_gemm_cu, gemv_batched_decode) { + auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + const int batch_size = 8; // num sequences + const int K = 256; // in_dim + const int N = 128; // out_dim + + tensor::Tensor input(base::DataType::kDataTypeFp32, batch_size, K, true, alloc_cpu); + tensor::Tensor weight(base::DataType::kDataTypeFp32, N, K, true, alloc_cpu); + tensor::Tensor output_cpu(base::DataType::kDataTypeFp32, batch_size, N, true, alloc_cpu); + + std::mt19937 gen(456); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + for (int i = 0; i < batch_size * K; ++i) { + input.index(i) = dist(gen); + } + for (int i = 0; i < N * K; ++i) { + weight.index(i) = dist(gen); + } + + // CPU reference + gemm_cpu_reference(input.ptr(), weight.ptr(), + output_cpu.ptr(), batch_size, K, N); + + input.to_cuda(nullptr); + weight.to_cuda(nullptr); + + tensor::Tensor output_cu(base::DataType::kDataTypeFp32, batch_size, N, true, alloc_cu); + + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + kernel::get_gemv_batched_kernel(base::DeviceType::kDeviceCUDA)(input, weight, output_cu, config); + cudaStreamSynchronize(config->stream); + + output_cu.to_cpu(); + + float max_diff = 0.0f; + for (int i = 0; i < batch_size * N; ++i) { + float diff = std::abs(output_cu.index(i) - output_cpu.index(i)); + max_diff = std::max(max_diff, diff); + ASSERT_NEAR(output_cu.index(i), output_cpu.index(i), 1e-3f) + << "Mismatch at index " << i; + } + LOG(INFO) << "Batched GEMV max difference: " << max_diff; + + delete config; +} + +// Test GEMM with non-aligned dimensions (edge cases) +TEST(test_gemm_cu, gemm_non_aligned) { + auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + // Non-power-of-2 dimensions + const int M = 17; // num_tokens + const int K = 67; // in_dim + const int N = 33; // out_dim + + tensor::Tensor input(base::DataType::kDataTypeFp32, M, K, true, alloc_cpu); + tensor::Tensor weight(base::DataType::kDataTypeFp32, N, K, true, alloc_cpu); + tensor::Tensor output_cpu(base::DataType::kDataTypeFp32, M, N, true, alloc_cpu); + + std::mt19937 gen(789); + std::uniform_real_distribution dist(-0.5f, 0.5f); + + for (int i = 0; i < M * K; ++i) { + input.index(i) = dist(gen); + } + for (int i = 0; i < N * K; ++i) { + 
weight.index(i) = dist(gen); + } + + gemm_cpu_reference(input.ptr(), weight.ptr(), + output_cpu.ptr(), M, K, N); + + input.to_cuda(nullptr); + weight.to_cuda(nullptr); + + tensor::Tensor output_cu(base::DataType::kDataTypeFp32, M, N, true, alloc_cu); + + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + kernel::get_gemm_kernel(base::DeviceType::kDeviceCUDA)(input, weight, output_cu, config); + cudaStreamSynchronize(config->stream); + + output_cu.to_cpu(); + + for (int i = 0; i < M * N; ++i) { + ASSERT_NEAR(output_cu.index(i), output_cpu.index(i), 1e-3f) + << "Mismatch at index " << i; + } + + delete config; +} + +// Test larger GEMM for performance validation +TEST(test_gemm_cu, gemm_large_prefill) { + auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + const int M = 128; // num_tokens (prefill) + const int K = 512; // in_dim + const int N = 512; // out_dim + + tensor::Tensor input(base::DataType::kDataTypeFp32, M, K, true, alloc_cpu); + tensor::Tensor weight(base::DataType::kDataTypeFp32, N, K, true, alloc_cpu); + tensor::Tensor output_cpu(base::DataType::kDataTypeFp32, M, N, true, alloc_cpu); + + std::mt19937 gen(999); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + for (int i = 0; i < M * K; ++i) { + input.index(i) = dist(gen); + } + for (int i = 0; i < N * K; ++i) { + weight.index(i) = dist(gen); + } + + gemm_cpu_reference(input.ptr(), weight.ptr(), + output_cpu.ptr(), M, K, N); + + input.to_cuda(nullptr); + weight.to_cuda(nullptr); + + tensor::Tensor output_cu(base::DataType::kDataTypeFp32, M, N, true, alloc_cu); + + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + + // Warmup + kernel::get_gemm_kernel(base::DeviceType::kDeviceCUDA)(input, weight, output_cu, config); + cudaStreamSynchronize(config->stream); + + // Timing + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + const int num_iterations = 10; + cudaEventRecord(start, config->stream); + for (int i = 0; i < num_iterations; ++i) { + kernel::get_gemm_kernel(base::DeviceType::kDeviceCUDA)(input, weight, output_cu, config); + } + cudaEventRecord(stop, config->stream); + cudaEventSynchronize(stop); + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + LOG(INFO) << "GEMM [" << M << "x" << K << "] x [" << N << "x" << K << "]^T: " + << milliseconds / num_iterations << " ms per iteration"; + + output_cu.to_cpu(); + + float max_diff = 0.0f; + for (int i = 0; i < M * N; ++i) { + float diff = std::abs(output_cu.index(i) - output_cpu.index(i)); + max_diff = std::max(max_diff, diff); + } + LOG(INFO) << "Max difference: " << max_diff; + ASSERT_LT(max_diff, 1e-2f) << "Results differ too much from CPU reference"; + + cudaEventDestroy(start); + cudaEventDestroy(stop); + delete config; +} diff --git a/test/test_op/test_cu_rope_batch.cpp b/test/test_op/test_cu_rope_batch.cpp new file mode 100644 index 0000000..27c6fa4 --- /dev/null +++ b/test/test_op/test_cu_rope_batch.cpp @@ -0,0 +1,223 @@ +#include +#include +#include +#include +#include +#include "../source/op/kernels/kernels_interface.h" +#include "../source/op/kernels/cuda/rope_kernel.cuh" +#include "base/buffer.h" + +using namespace kernel; + +// Test batched RoPE kernel +TEST(test_rope_batch_cu, rope_batch_basic) { + auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + const int num_tokens = 4; + const 
int num_heads = 8; + const int head_size = 64; + const int dim = num_heads * head_size; // 512 + const int num_kv_heads = 4; + const int kv_dim = num_kv_heads * head_size; // 256 + const int max_seq_len = 128; + + // Create tensors + tensor::Tensor input_q(base::DataType::kDataTypeFp32, num_tokens, dim, true, alloc_cpu); + tensor::Tensor input_k(base::DataType::kDataTypeFp32, num_tokens, kv_dim, true, alloc_cpu); + tensor::Tensor positions(base::DataType::kDataTypeInt32, num_tokens, true, alloc_cpu); + tensor::Tensor sin_cache(base::DataType::kDataTypeFp32, max_seq_len, head_size, true, alloc_cpu); + tensor::Tensor cos_cache(base::DataType::kDataTypeFp32, max_seq_len, head_size, true, alloc_cpu); + + // Initialize Q and K with random values + std::mt19937 gen(42); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + for (int i = 0; i < num_tokens * dim; ++i) { + input_q.index(i) = dist(gen); + } + for (int i = 0; i < num_tokens * kv_dim; ++i) { + input_k.index(i) = dist(gen); + } + + // Set positions: [0, 1, 2, 3] + for (int i = 0; i < num_tokens; ++i) { + positions.index(i) = i; + } + + // Clone for comparison + tensor::Tensor input_q_ref = input_q.clone(); + tensor::Tensor input_k_ref = input_k.clone(); + + // Move to GPU + input_q.to_cuda(nullptr); + input_k.to_cuda(nullptr); + positions.to_cuda(nullptr); + sin_cache.to_cuda(nullptr); + cos_cache.to_cuda(nullptr); + + // Calculate sin/cos cache on GPU + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + + kernel::sin_cos_cache_calc_cu(head_size, max_seq_len, sin_cache, cos_cache, config->stream); + cudaStreamSynchronize(config->stream); + + // Run batched RoPE kernel + kernel::get_rope_batch_kernel(base::DeviceType::kDeviceCUDA)( + num_tokens, dim, kv_dim, head_size, + input_q, input_k, positions, sin_cache, cos_cache, config->stream); + cudaStreamSynchronize(config->stream); + + // Copy results back + input_q.to_cpu(); + input_k.to_cpu(); + + // Compute reference using single-token RoPE kernel for each token + for (int t = 0; t < num_tokens; ++t) { + // Create single-token tensors + tensor::Tensor q_single(base::DataType::kDataTypeFp32, dim, true, alloc_cpu); + tensor::Tensor k_single(base::DataType::kDataTypeFp32, kv_dim, true, alloc_cpu); + // pos_single must stay on CPU because rope_kernel_cu reads it on host + tensor::Tensor pos_single(base::DataType::kDataTypeInt32, 1, true, alloc_cpu); + + // Copy data + for (int i = 0; i < dim; ++i) { + q_single.index(i) = input_q_ref.index(t * dim + i); + } + for (int i = 0; i < kv_dim; ++i) { + k_single.index(i) = input_k_ref.index(t * kv_dim + i); + } + pos_single.index(0) = t; + + // Move Q and K to GPU (pos stays on CPU!) 
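+    // (sin_cache / cos_cache were already moved to CUDA above and are reused
+    //  here for every token; only q_single / k_single need a fresh transfer.)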
+ q_single.to_cuda(nullptr); + k_single.to_cuda(nullptr); + + // Run single-token RoPE + kernel::get_rope_kernel(base::DeviceType::kDeviceCUDA)( + dim, kv_dim, head_size, q_single, k_single, pos_single, + sin_cache, cos_cache, config->stream); + cudaStreamSynchronize(config->stream); + + // Copy back + q_single.to_cpu(); + k_single.to_cpu(); + + // Compare Q + for (int i = 0; i < dim; ++i) { + ASSERT_NEAR(input_q.index(t * dim + i), q_single.index(i), 1e-5f) + << "Q mismatch at token " << t << ", index " << i; + } + + // Compare K + for (int i = 0; i < kv_dim; ++i) { + ASSERT_NEAR(input_k.index(t * kv_dim + i), k_single.index(i), 1e-5f) + << "K mismatch at token " << t << ", index " << i; + } + } + + LOG(INFO) << "Batched RoPE test passed for " << num_tokens << " tokens"; + delete config; +} + +// Test batched RoPE with non-sequential positions (for multi-sequence decode) +TEST(test_rope_batch_cu, rope_batch_multi_sequence) { + auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + const int num_tokens = 4; // 4 sequences, each with 1 token + const int num_heads = 8; + const int head_size = 64; + const int dim = num_heads * head_size; + const int num_kv_heads = 4; + const int kv_dim = num_kv_heads * head_size; + const int max_seq_len = 256; + + tensor::Tensor input_q(base::DataType::kDataTypeFp32, num_tokens, dim, true, alloc_cpu); + tensor::Tensor input_k(base::DataType::kDataTypeFp32, num_tokens, kv_dim, true, alloc_cpu); + tensor::Tensor positions(base::DataType::kDataTypeInt32, num_tokens, true, alloc_cpu); + tensor::Tensor sin_cache(base::DataType::kDataTypeFp32, max_seq_len, head_size, true, alloc_cpu); + tensor::Tensor cos_cache(base::DataType::kDataTypeFp32, max_seq_len, head_size, true, alloc_cpu); + + std::mt19937 gen(123); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + for (int i = 0; i < num_tokens * dim; ++i) { + input_q.index(i) = dist(gen); + } + for (int i = 0; i < num_tokens * kv_dim; ++i) { + input_k.index(i) = dist(gen); + } + + // Non-sequential positions (simulating different sequences at different positions) + // Seq 0 at pos 10, Seq 1 at pos 25, Seq 2 at pos 5, Seq 3 at pos 100 + int pos_values[] = {10, 25, 5, 100}; + positions.index(0) = pos_values[0]; + positions.index(1) = pos_values[1]; + positions.index(2) = pos_values[2]; + positions.index(3) = pos_values[3]; + + tensor::Tensor input_q_ref = input_q.clone(); + tensor::Tensor input_k_ref = input_k.clone(); + + input_q.to_cuda(nullptr); + input_k.to_cuda(nullptr); + positions.to_cuda(nullptr); + sin_cache.to_cuda(nullptr); + cos_cache.to_cuda(nullptr); + + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + + kernel::sin_cos_cache_calc_cu(head_size, max_seq_len, sin_cache, cos_cache, config->stream); + cudaStreamSynchronize(config->stream); + + kernel::get_rope_batch_kernel(base::DeviceType::kDeviceCUDA)( + num_tokens, dim, kv_dim, head_size, + input_q, input_k, positions, sin_cache, cos_cache, config->stream); + cudaStreamSynchronize(config->stream); + + input_q.to_cpu(); + input_k.to_cpu(); + + // Verify each token independently + for (int t = 0; t < num_tokens; ++t) { + tensor::Tensor q_single(base::DataType::kDataTypeFp32, dim, true, alloc_cpu); + tensor::Tensor k_single(base::DataType::kDataTypeFp32, kv_dim, true, alloc_cpu); + // pos_single must stay on CPU! 
+ tensor::Tensor pos_single(base::DataType::kDataTypeInt32, 1, true, alloc_cpu); + + for (int i = 0; i < dim; ++i) { + q_single.index(i) = input_q_ref.index(t * dim + i); + } + for (int i = 0; i < kv_dim; ++i) { + k_single.index(i) = input_k_ref.index(t * kv_dim + i); + } + pos_single.index(0) = pos_values[t]; + + q_single.to_cuda(nullptr); + k_single.to_cuda(nullptr); + // pos_single stays on CPU + + kernel::get_rope_kernel(base::DeviceType::kDeviceCUDA)( + dim, kv_dim, head_size, q_single, k_single, pos_single, + sin_cache, cos_cache, config->stream); + cudaStreamSynchronize(config->stream); + + q_single.to_cpu(); + k_single.to_cpu(); + + for (int i = 0; i < dim; ++i) { + ASSERT_NEAR(input_q.index(t * dim + i), q_single.index(i), 1e-5f) + << "Q mismatch at token " << t << " (pos=" << pos_values[t] << "), index " << i; + } + for (int i = 0; i < kv_dim; ++i) { + ASSERT_NEAR(input_k.index(t * kv_dim + i), k_single.index(i), 1e-5f) + << "K mismatch at token " << t << " (pos=" << pos_values[t] << "), index " << i; + } + } + + LOG(INFO) << "Multi-sequence batched RoPE test passed"; + delete config; +}
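The kernels above are only exercised piecewise by the unit tests. As a reading aid, here is a minimal host-side sketch (not part of the diff) of how the new batched RoPE, KV-cache write, and paged-attention entry points could be chained for one decode step of a single layer. The function name decode_step_sketch, the toy sizes, and the assumption that block_tables / context_lens / positions have already been filled by a cache manager are all illustrative; only the kernel signatures and the tensor / CudaConfig usage are taken from the code above.

#include <cmath>
#include <cuda_runtime.h>
#include "../source/op/kernels/kernels_interface.h"
#include "../source/op/kernels/cuda/paged_attention_kernel.cuh"
#include "../source/op/kernels/cuda/rope_kernel.cuh"
#include "base/buffer.h"

void decode_step_sketch(kernel::CudaConfig* config) {
  auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance();

  // Assumed toy configuration: one decode token per sequence (hypothetical values).
  const int num_seqs = 2, num_heads = 8, num_kv_heads = 4, head_size = 64;
  const int dim = num_heads * head_size, kv_dim = num_kv_heads * head_size;
  const int block_size = 16, max_blocks_per_seq = 8, total_blocks = 64;
  const int num_layers = 1, layer_idx = 0, max_seq_len = 128;

  // Per-step activations on CUDA: one token per sequence.
  tensor::Tensor q(base::DataType::kDataTypeFp32, num_seqs, dim, true, alloc_cu);
  tensor::Tensor k(base::DataType::kDataTypeFp32, num_seqs, kv_dim, true, alloc_cu);
  tensor::Tensor v(base::DataType::kDataTypeFp32, num_seqs, kv_dim, true, alloc_cu);
  tensor::Tensor out(base::DataType::kDataTypeFp32, num_seqs, dim, true, alloc_cu);

  // Paged cache and batch metadata; block_tables / context_lens / positions are
  // assumed to have been filled by the KV cache manager before this point.
  tensor::Tensor kv_cache(base::DataType::kDataTypeFp32,
                          static_cast<size_t>(total_blocks) * num_layers * 2 * block_size * kv_dim,
                          true, alloc_cu, nullptr);
  tensor::Tensor block_tables(base::DataType::kDataTypeInt32, num_seqs, max_blocks_per_seq, true, alloc_cu);
  tensor::Tensor context_lens(base::DataType::kDataTypeInt32, num_seqs, true, alloc_cu);
  tensor::Tensor positions(base::DataType::kDataTypeInt32, num_seqs, true, alloc_cu);

  // RoPE sin/cos caches, computed once on the stream.
  tensor::Tensor sin_cache(base::DataType::kDataTypeFp32, max_seq_len, head_size, true, alloc_cu);
  tensor::Tensor cos_cache(base::DataType::kDataTypeFp32, max_seq_len, head_size, true, alloc_cu);
  kernel::sin_cos_cache_calc_cu(head_size, max_seq_len, sin_cache, cos_cache, config->stream);

  // 1) Rotate Q/K for all in-flight sequences in a single launch.
  kernel::get_rope_batch_kernel(base::DeviceType::kDeviceCUDA)(
      num_seqs, dim, kv_dim, head_size, q, k, positions, sin_cache, cos_cache, config->stream);

  // 2) Append each sequence's new K/V to its physical block for this layer.
  kernel::batch_kv_cache_write_cu(kv_cache, k, v, block_tables, positions, num_seqs,
                                  num_kv_heads, head_size, block_size, max_blocks_per_seq,
                                  layer_idx, num_layers, config->stream);

  // 3) Attention over the paged cache (one query token per sequence -> decode kernel).
  const float scale = 1.0f / std::sqrt(static_cast<float>(head_size));
  kernel::paged_attention_kernel_cu(out, q, kv_cache, block_tables, context_lens,
                                    /*max_context_len=*/max_seq_len, num_seqs, num_heads,
                                    num_kv_heads, head_size, block_size, max_blocks_per_seq,
                                    scale, layer_idx, num_layers, config->stream);
  cudaStreamSynchronize(config->stream);
}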