diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..01b3388 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,21 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**", + "${workspaceFolder}/kuiper/include", + "/usr/local/cuda/include", + "/usr/include/c++/**", + "/usr/include" + ], + "defines": [], + "compilerPath": "/usr/bin/g++", + "cStandard": "c17", + "cppStandard": "c++17", + "intelliSenseMode": "linux-gcc-x64", + "compileCommands": "${workspaceFolder}/build/compile_commands.json" + } + ], + "version": 4 +} diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..630ce10 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,30 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "C++ Launch", + "type": "cppdbg", + "request": "launch", + "program": "/home/tangjin/KuiperLLama/build/demo/llama_infer", // 可执行文件路径 + "args": [], // 可选参数 + "stopAtEntry": false, + "cwd": "${workspaceFolder}", + "environment": [], + "externalConsole": false, + "MIMode": "gdb", + "miDebuggerPath": "/usr/bin/gdb", // 调试器路径,可以是gdb或lldb + "setupCommands": [ + { + "description": "Enable pretty-printing for gdb", + "text": "-enable-pretty-printing", + "ignoreFailures": true + } + ], + "preLaunchTask": "build", // 构建任务名称, 可选 + "internalConsoleOptions": "openOnSessionStart" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..565c0e0 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,118 @@ + +{ + + "files.associations": { + "cctype": "cpp", + "cmath": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "chrono": "cpp", + "compare": "cpp", + "concepts": "cpp", + "cstdint": "cpp", + "unordered_map": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "random": "cpp", + "ratio": "cpp", + "string": "cpp", + "string_view": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "initializer_list": "cpp", + "iosfwd": "cpp", + "istream": "cpp", + "limits": "cpp", + "new": "cpp", + "ostream": "cpp", + "ranges": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "typeinfo": "cpp", + "__nullptr": "cpp", + "iostream": "cpp", + "cstdarg": "cpp", + "clocale": "cpp", + "complex": "cpp", + "condition_variable": "cpp", + "deque": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "fstream": "cpp", + "iomanip": "cpp", + "mutex": "cpp", + "numbers": "cpp", + "numeric": "cpp", + "optional": "cpp", + "semaphore": "cpp", + "stop_token": "cpp", + "system_error": "cpp", + "typeindex": "cpp", + "variant": "cpp", + "filesystem": "cpp", + "any": "cpp", + "regex": "cpp", + "unordered_set": "cpp", + "*.inc": "cpp", + "*.ipp": "cpp", + "span": "cpp", + "__node_handle": "cpp", + "__split_buffer": "cpp", + "queue": "cpp", + "stack": "cpp", + "shared_mutex": "cpp", + "__mutex_base": "cpp", + "valarray": "cpp", + "__hash_table": "cpp", + "__tree": "cpp", + "__locale": "cpp", 
+ "__string": "cpp", + "forward_list": "cpp", + "__bit_reference": "cpp", + "csetjmp": "cpp", + "csignal": "cpp", + "strstream": "cpp", + "cfenv": "cpp", + "cinttypes": "cpp", + "codecvt": "cpp", + "source_location": "cpp", + "future": "cpp", + "barrier": "cpp", + "charconv": "cpp", + "coroutine": "cpp", + "cuchar": "cpp", + "latch": "cpp", + "scoped_allocator": "cpp", + "syncstream": "cpp", + "hash_map": "cpp", + "hash_set": "cpp", + "__config": "cpp", + "__tuple": "cpp", + "ios": "cpp" + }, + "C_Cpp.errorSquiggles": "disabled", + "window.zoomLevel": 3, + "window.zoomPerWindow": false, + "testMate.cpp.test.executables": "/home/tangjin/KuiperLLama/build/*/*", + "testMate.cpp.test.workingDirectory": "${absDirpath}", + "cmake.ctest.testExplorerIntegrationEnabled": true +} \ No newline at end of file diff --git a/.vscode/task.json b/.vscode/task.json new file mode 100644 index 0000000..ca5fa8b --- /dev/null +++ b/.vscode/task.json @@ -0,0 +1,16 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "cmake", + "label": "build", + "command": "build", + "targets": [ + "all" + ], + "group": "build", + "problemMatcher": [], + "detail": "CMake template build task" + } + ] +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index ae718eb..e33ec4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,7 +91,8 @@ find_package(Armadillo REQUIRED) aux_source_directory(kuiper/source/tensor/ DIR_TENSOR) aux_source_directory(kuiper/source/base/ DIR_BASE) aux_source_directory(kuiper/source/op/ DIR_OP) -aux_source_directory(kuiper/source/model/ DIR_MODEL) +file(GLOB DIR_MODEL CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/kuiper/source/model/*.cpp") +list(REMOVE_ITEM DIR_MODEL "${CMAKE_CURRENT_SOURCE_DIR}/kuiper/source/model/model_paged.cpp") aux_source_directory(kuiper/source/op/kernels/cpu DIR_KERNEL_CPU) aux_source_directory(kuiper/source/op/kernels/cuda DIR_KERNEL_CUDA) aux_source_directory(kuiper/source/op/kernels/ DIR_KERNEL) diff --git a/demo/CMakeLists.txt b/demo/CMakeLists.txt index efaab22..859b53b 100644 --- a/demo/CMakeLists.txt +++ b/demo/CMakeLists.txt @@ -11,6 +11,39 @@ if (LLAMA3_SUPPORT) endif () set_target_properties(llama_infer PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +add_executable(llama_infer_paged mainpaged.cpp) +target_link_directories(llama_infer_paged PUBLIC ${PROJECT_SOURCE_DIR}/lib) +target_link_libraries(llama_infer_paged llama) +if (LLAMA3_SUPPORT) + find_package(absl REQUIRED) + find_package(re2 REQUIRED) + find_package(nlohmann_json REQUIRED) + target_link_libraries(llama_infer_paged absl::base re2::re2 nlohmann_json::nlohmann_json) +endif () +set_target_properties(llama_infer_paged PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + +add_executable(llama_infer_unified main_unified.cpp) +target_link_directories(llama_infer_unified PUBLIC ${PROJECT_SOURCE_DIR}/lib) +target_link_libraries(llama_infer_unified llama) +if (LLAMA3_SUPPORT) + find_package(absl REQUIRED) + find_package(re2 REQUIRED) + find_package(nlohmann_json REQUIRED) + target_link_libraries(llama_infer_unified absl::base re2::re2 nlohmann_json::nlohmann_json) +endif () +set_target_properties(llama_infer_unified PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + +add_executable(llama_infer_continuous main_continuous.cpp) +target_link_directories(llama_infer_continuous PUBLIC ${PROJECT_SOURCE_DIR}/lib) +target_link_libraries(llama_infer_continuous llama) +if (LLAMA3_SUPPORT) + find_package(absl REQUIRED) + find_package(re2 REQUIRED) + find_package(nlohmann_json REQUIRED) + 
target_link_libraries(llama_infer_continuous absl::base re2::re2 nlohmann_json::nlohmann_json) +endif () +set_target_properties(llama_infer_continuous PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + if (QWEN2_SUPPORT) message(STATUS "LINK QWEN2 SUPPORT") add_executable(qwen_infer main_qwen.cpp) diff --git a/demo/main_continuous.cpp b/demo/main_continuous.cpp new file mode 100644 index 0000000..28ba219 --- /dev/null +++ b/demo/main_continuous.cpp @@ -0,0 +1,352 @@ +#include +#include +#include +#include +#include +#include +#include +#include "base/batch_metadata.h" +#include "base/kv_cache_manager.h" +#include "base/scheduler.h" +#include "base/scheduler_config.h" +#include "base/sequence.h" +#include "model/llama3.h" +#include "model/config.h" + +void get_gpu_memory_info(size_t* free, size_t* total) { + cudaError_t status = cudaMemGetInfo(free, total); + if (status != cudaSuccess) { + std::cerr << "Error: cudaMemGetInfo failed with " << cudaGetErrorString(status) << std::endl; + return; + } + std::cout << ">>> [GPU] Free Memory: " << *free / (1024 * 1024) << " MB" << std::endl; + std::cout << ">>> [GPU] Total Memory: " << *total / (1024 * 1024) << " MB" << std::endl; +} + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::cout << "Usage: ./llama_infer_continuous [num_seqs] [max_batched_tokens]" << std::endl; + std::cout << " num_seqs: number of parallel sequences (default: 4)" << std::endl; + std::cout << " max_batched_tokens: max tokens per iteration (default: 512)" << std::endl; + return -1; + } + const char* checkpoint_path = argv[1]; + const char* tokenizer_path = argv[2]; + int32_t num_seqs = (argc > 3) ? std::atoi(argv[3]) : 4; + int32_t max_batched_tokens = (argc > 4) ? std::atoi(argv[4]) : 512; + + // ========================================== + // 1. Initialize model + // ========================================== + std::cout << ">>> [Init] Loading model from " << checkpoint_path << std::endl; + + model::LLama2Model model(base::TokenizerType::kEncodeSpe, tokenizer_path, + checkpoint_path, false); + + auto init_status = model.init(base::DeviceType::kDeviceCUDA); + if (!init_status) { + std::cerr << "Model init failed: " << init_status.get_err_code() << std::endl; + return -1; + } + + std::cout << ">>> [Init] Model loaded successfully" << std::endl; + + // ========================================== + // 2. Get model config + // ========================================== + const auto* config = model.config(); + if (!config) { + std::cerr << "Error: Failed to get model config" << std::endl; + return -1; + } + + const int32_t head_dim = config->head_size_; + const int32_t num_kv_heads = config->kv_head_num_; + const int32_t num_layers = config->layer_num_; + const int32_t dim = config->dim_; + const int32_t seq_len = config->seq_len_; + const int32_t kv_dim = num_kv_heads * head_dim; + + std::cout << "\n>>> [Model Config]" << std::endl; + std::cout << " dim : " << dim << std::endl; + std::cout << " num_layers : " << num_layers << std::endl; + std::cout << " num_kv_heads : " << num_kv_heads << std::endl; + std::cout << " head_dim : " << head_dim << std::endl; + std::cout << " seq_len : " << seq_len << std::endl; + + // ========================================== + // 3. 
Prepare input sentences + // ========================================== + std::vector sentences = { + "Once upon a time, in a land far away,", + "The quick brown fox jumps over the lazy dog.", + "In the beginning, there was nothing but darkness.", + "A long time ago in a galaxy far, far away,", + "It was the best of times, it was the worst of times.", + "To be or not to be, that is the question.", + "All happy families are alike; each unhappy family is unhappy in its own way.", + "Call me Ishmael. Some years ago, never mind how long precisely,", + "Man, What can i say?", + "What are you doing, Ouch", + "Once upon a time, there was a mountain, and on the mountain, there was a temple.", + "It's been a long day without you, my friend.", + "Youth is not a time of life; it is a state of mind.", + }; + + if (num_seqs < static_cast(sentences.size())) { + sentences.resize(num_seqs); + } + int32_t actual_num_seqs = static_cast(sentences.size()); + + std::cout << "\n>>> [Input] Will process " << actual_num_seqs << " sequences" << std::endl; + + // ========================================== + // 4. Initialize Paged KV Cache + // ========================================== + const int block_size = 16; + + size_t free_mem, total_mem; + get_gpu_memory_info(&free_mem, &total_mem); + + // Calculate activation memory based on model config + // max_seq_len for activation = max_batched_tokens (chunked prefill limits this) + int32_t max_activation_seq_len = max_batched_tokens; + int32_t hidden_dim = config->hidden_dim_; // FFN intermediate dimension + + size_t activation_peak = 0; + // 1. Hidden states + activation_peak += (size_t)max_activation_seq_len * dim * sizeof(float); + // 2. Q/K/V projection tensors + activation_peak += (size_t)max_activation_seq_len * dim * sizeof(float); // Q + activation_peak += (size_t)max_activation_seq_len * kv_dim * sizeof(float); // K + activation_peak += (size_t)max_activation_seq_len * kv_dim * sizeof(float); // V + // 3. Attention output + activation_peak += (size_t)max_activation_seq_len * dim * sizeof(float); + // 4. FFN intermediate (SwiGLU has gate and up branches) + activation_peak += (size_t)2 * max_activation_seq_len * hidden_dim * sizeof(float); + // 5. Misc buffers (softmax, layernorm, etc.) 
+ activation_peak += (size_t)max_activation_seq_len * dim * sizeof(float); + + // Add safety margin (20%) + size_t reserved_memory = (size_t)(activation_peak * 1.2); + + size_t one_block_bytes = (size_t)block_size * num_layers * 2 * kv_dim * sizeof(float); + size_t kv_cache_pool_size = (size_t)(free_mem * 0.9) - reserved_memory; + int total_blocks = kv_cache_pool_size / one_block_bytes; + int max_blocks_per_seq = (seq_len + block_size - 1) / block_size; + + std::cout << "\n>>> [Memory Plan]" << std::endl; + std::cout << " Activation Peak : " << activation_peak / 1024 / 1024 << " MB" << std::endl; + std::cout << " Reserved Memory : " << reserved_memory / 1024 / 1024 << " MB" << std::endl; + std::cout << " KV Cache Pool : " << kv_cache_pool_size / 1024 / 1024 << " MB" << std::endl; + std::cout << " One Block Size : " << one_block_bytes / 1024 << " KB" << std::endl; + std::cout << " Total Blocks : " << total_blocks << std::endl; + std::cout << " Max Batched Tokens : " << max_batched_tokens << std::endl; + + auto kv_init_status = model.init_paged_kv_cache(actual_num_seqs, block_size, total_blocks, max_blocks_per_seq); + if (!kv_init_status) { + std::cerr << "Error: Failed to initialize paged KV cache" << std::endl; + return -1; + } + + auto* kv_manager = model.get_kv_cache_manager(); + + std::cout << ">>> [Init] Paged KV Cache initialized" << std::endl; + + // ========================================== + // 5. Initialize Scheduler + // ========================================== + base::SchedulerConfig sched_config; + sched_config.max_num_batched_tokens = max_batched_tokens; + sched_config.max_num_seqs = actual_num_seqs; + sched_config.max_model_len = seq_len; + sched_config.max_prefill_tokens = 128; // Chunk size for prefill + sched_config.block_size = block_size; + sched_config.min_free_blocks = 8; + sched_config.decode_priority = true; + + base::Scheduler scheduler(sched_config); + scheduler.set_kv_cache_manager(kv_manager); + + // Add all requests to scheduler + int32_t max_new_tokens = 50; + std::cout << "\n>>> [Input] Encoding and adding " << actual_num_seqs << " sequences:" << std::endl; + for (int32_t i = 0; i < actual_num_seqs; ++i) { + auto prompt_tokens = model.encode(sentences[i]); + if (prompt_tokens.empty()) { + std::cerr << "Error: Failed to encode sentence " << i << std::endl; + return -1; + } + scheduler.add_request(prompt_tokens, max_new_tokens, sentences[i]); + std::cout << " Seq " << i << ": " << prompt_tokens.size() << " tokens - \"" + << sentences[i].substr(0, 40) << "...\"" << std::endl; + } + + // ========================================== + // 6. Continuous Batching Main Loop + // ========================================== + std::cout << "\n>>> [Inference] Starting continuous batching with scheduler..." << std::endl; + std::cout << " Decode Priority: " << (sched_config.decode_priority ? 
"ON" : "OFF") << std::endl; + + int total_iterations = 0; + int total_prefill_tokens = 0; + int total_decode_tokens = 0; + + auto start_time = std::chrono::steady_clock::now(); + + // Get allocators for BatchMetadata + auto alloc_gpu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + while (!scheduler.all_finished()) { + // Get scheduling decision + auto sched_output = scheduler.schedule(); + if (!sched_output.has_work()) { + std::cout << " [Done] No more work to schedule" << std::endl; + break; + } + + // Log preempted sequences + if (!sched_output.preempted_seq_ids.empty()) { + std::cout << " [Preemption] Evicted " << sched_output.preempted_seq_ids.size() + << " sequences due to memory pressure: "; + for (int32_t pid : sched_output.preempted_seq_ids) { + std::cout << pid << " "; + } + std::cout << std::endl; + } + + // Build batch data from scheduler output + std::vector batch_seq_ids; + std::vector batch_token_counts; + std::vector batch_start_positions; + std::vector batch_context_lens; + std::vector batch_is_prefill; + std::vector all_tokens; + + for (const auto& sched_seq : sched_output.scheduled_seqs) { + batch_seq_ids.push_back(sched_seq.kv_slot); // Use kv_slot for KV cache indexing + batch_token_counts.push_back(sched_seq.num_tokens); + batch_start_positions.push_back(sched_seq.start_pos); + batch_context_lens.push_back(sched_seq.context_len); + batch_is_prefill.push_back(sched_seq.is_prefill); + all_tokens.insert(all_tokens.end(), sched_seq.tokens.begin(), sched_seq.tokens.end()); + } + + // Allocate additional blocks for decode sequences + for (const auto& sched_seq : sched_output.scheduled_seqs) { + if (!sched_seq.is_prefill) { + kv_manager->allocate_blocks_for_tokens(sched_seq.kv_slot, sched_seq.context_len); + } + } + + // Get embeddings + const auto& embedding_output = model.embedding(all_tokens); + auto [tok_ids, tok_emb, tok_num] = embedding_output; + + // Create mixed batch metadata + auto batch_meta = base::BatchMetadata::create_mixed( + batch_seq_ids, + batch_token_counts, + batch_start_positions, + batch_context_lens, + batch_is_prefill, + alloc_gpu, + alloc_cpu, + nullptr); + + // Forward pass + std::vector next_tokens; + auto forward_status = model.forward_unified(tok_emb, batch_meta, next_tokens); + if (!forward_status) { + std::cerr << "Error: forward_unified failed: " << forward_status.get_err_msg() << std::endl; + break; + } + + // Update scheduler state + scheduler.update_after_forward(sched_output, next_tokens); + + // Check for EOS and max tokens + for (size_t i = 0; i < sched_output.scheduled_seqs.size(); ++i) { + const auto& sched_seq = sched_output.scheduled_seqs[i]; + const auto* seq = scheduler.get_sequence(sched_seq.seq_id); + if (!seq) continue; + + // Check EOS + if (i < next_tokens.size() && model.is_sentence_ending(next_tokens[i])) { + scheduler.finish_sequence(sched_seq.seq_id); + continue; + } + + // Check max tokens (only for decode phase) + if (!sched_seq.is_prefill && seq->num_generated() >= max_new_tokens) { + scheduler.finish_sequence(sched_seq.seq_id); + } + } + + // Log prefill completion + for (const auto& sched_seq : sched_output.scheduled_seqs) { + if (sched_seq.is_prefill) { + const auto* seq = scheduler.get_sequence(sched_seq.seq_id); + if (seq && !seq->is_prefill()) { + std::cout << " [Prefill Done] Seq " << sched_seq.seq_id + << " completed prefill, starting decode" << std::endl; + } + } + } + + total_prefill_tokens += sched_output.num_prefill_tokens; + 
total_decode_tokens += sched_output.num_decode_tokens; + total_iterations++; + + // Progress logging every 10 iterations + if (total_iterations % 10 == 0) { + std::cout << " [Iter " << total_iterations << "] " + << "Batch: " << sched_output.num_tokens << " tokens " + << "(prefill: " << sched_output.num_prefill_tokens + << ", decode: " << sched_output.num_decode_tokens << ") " + << "| Waiting: " << scheduler.num_waiting() + << ", Running: " << scheduler.num_running() + << ", Preempted: " << scheduler.num_preempted() + << ", Finished: " << scheduler.num_finished() + << std::endl; + } + } + + auto end_time = std::chrono::steady_clock::now(); + double total_ms = std::chrono::duration_cast>( + end_time - start_time).count(); + + // ========================================== + // 7. Print generated text for each sequence + // ========================================== + std::cout << "\n>>> [Generated Text]" << std::endl; + auto all_seqs = scheduler.get_all_sequences(); + for (const auto* seq : all_seqs) { + std::string generated_text = model.decode(seq->output_tokens); + std::string prompt_preview = seq->original_text.substr(0, 30); + std::cout << " Seq " << seq->seq_id << " (" << seq->output_tokens.size() << " tokens): " + << prompt_preview << "... " << generated_text << std::endl; + } + + // ========================================== + // 8. Statistics + // ========================================== + std::cout << "\n>>> [Statistics]" << std::endl; + std::cout << " Num sequences : " << actual_num_seqs << std::endl; + std::cout << " Total iterations : " << total_iterations << std::endl; + std::cout << " Total prefill tok : " << total_prefill_tokens << std::endl; + std::cout << " Total decode tok : " << total_decode_tokens << std::endl; + std::cout << " Total time : " << total_ms << " ms" << std::endl; + std::cout << " Overall throughput : " << ((total_prefill_tokens + total_decode_tokens) * 1000.0 / total_ms) + << " tokens/sec" << std::endl; + + std::cout << "\n>>> [Scheduler Config]" << std::endl; + std::cout << " max_num_batched_tokens : " << sched_config.max_num_batched_tokens << std::endl; + std::cout << " max_prefill_tokens : " << sched_config.max_prefill_tokens << std::endl; + std::cout << " decode_priority : " << (sched_config.decode_priority ? "true" : "false") << std::endl; + + std::cout << "\n>>> [Success] Continuous batching with scheduler completed!" 
<< std::endl; + + return 0; +} \ No newline at end of file diff --git a/demo/main_unified.cpp b/demo/main_unified.cpp new file mode 100644 index 0000000..5d0a969 --- /dev/null +++ b/demo/main_unified.cpp @@ -0,0 +1,308 @@ +#include +#include +#include +#include +#include +#include +#include +#include "base/batch_metadata.h" +#include "base/kv_cache_manager.h" +#include "model/llama3.h" +#include "model/config.h" + +void get_gpu_memory_info(size_t* free, size_t* total) { + cudaError_t status = cudaMemGetInfo(free, total); + if (status != cudaSuccess) { + std::cerr << "Error: cudaMemGetInfo failed with " << cudaGetErrorString(status) << std::endl; + return; + } + std::cout << ">>> [GPU] Free Memory: " << *free / (1024 * 1024) << " MB" << std::endl; + std::cout << ">>> [GPU] Total Memory: " << *total / (1024 * 1024) << " MB" << std::endl; +} + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::cout << "Usage: ./llama_infer_unified [num_seqs] [chunk_size]" << std::endl; + std::cout << " num_seqs: number of parallel sequences (default: 4)" << std::endl; + std::cout << " chunk_size: optional, 0 for auto (default: 0)" << std::endl; + return -1; + } + const char* checkpoint_path = argv[1]; + const char* tokenizer_path = argv[2]; + int32_t num_seqs = (argc > 3) ? std::atoi(argv[3]) : 4; + int32_t chunk_size = (argc > 4) ? std::atoi(argv[4]) : 0; + + // ========================================== + // 1. Initialie model + // ========================================== + std::cout << ">>> [Init] Loading model from " << checkpoint_path << std::endl; + + model::LLama2Model model(base::TokenizerType::kEncodeSpe, tokenizer_path, + checkpoint_path, false); + + auto init_status = model.init(base::DeviceType::kDeviceCUDA); + if (!init_status) { + std::cerr << "Model init failed: " << init_status.get_err_code() << std::endl; + return -1; + } + + std::cout << ">>> [Init] Model loaded successfully" << std::endl; + + // ========================================== + // 2. Get model config + // ========================================== + const auto* config = model.config(); + if (!config) { + std::cerr << "Error: Failed to get model config" << std::endl; + return -1; + } + + const int32_t head_dim = config->head_size_; + const int32_t num_kv_heads = config->kv_head_num_; + const int32_t num_heads = config->head_num_; + const int32_t num_layers = config->layer_num_; + const int32_t dim = config->dim_; + const int32_t hidden_dim = config->hidden_dim_; + const int32_t seq_len = config->seq_len_; + const int32_t kv_dim = num_kv_heads * head_dim; + + std::cout << "\n>>> [Model Config]" << std::endl; + std::cout << " dim : " << dim << std::endl; + std::cout << " hidden_dim : " << hidden_dim << std::endl; + std::cout << " num_layers : " << num_layers << std::endl; + std::cout << " num_heads : " << num_heads << std::endl; + std::cout << " num_kv_heads : " << num_kv_heads << std::endl; + std::cout << " head_dim : " << head_dim << std::endl; + std::cout << " seq_len : " << seq_len << std::endl; + + // ========================================== + // 3. 
Prepare input sentences (multiple sequences) + // ========================================== + std::vector sentences = { + "Once upon a time, in a land far away,", + "The quick brown fox jumps over the lazy dog.", + "In the beginning, there was nothing but darkness.", + "A long time ago in a galaxy far, far away,", + "It was the best of times, it was the worst of times.", + "To be or not to be, that is the question.", + "All happy families are alike; each unhappy family is unhappy in its own way.", + "Call me Ishmael. Some years ago, never mind how long precisely,", + "Man, What can i say?", + "What are you doing, Ouch" , + "Once upon a time, there was a mountain, and on the mountain, there was a temple.", + "It's been a long day without you, my friend.", + "Youth is not a time of life; it is a state of mind;it is not a matter of rosy cheeks, red lips and supple knees;it is a matter of the will, a quality of the imagination, a vigor of the emotions", + + }; + + // Limit to requested number of sequences + if (num_seqs < static_cast(sentences.size())) { + sentences.resize(num_seqs); + } + int32_t actual_num_seqs = static_cast(sentences.size()); + + std::cout << "\n>>> [Input] Encoding " << actual_num_seqs << " sequences:" << std::endl; + + std::vector> all_tokens(actual_num_seqs); + int32_t max_prompt_len = 0; + for (int32_t i = 0; i < actual_num_seqs; ++i) { + all_tokens[i] = model.encode(sentences[i]); + if (all_tokens[i].empty()) { + std::cerr << "Error: Failed to encode sentence " << i << std::endl; + return -1; + } + max_prompt_len = std::max(max_prompt_len, static_cast(all_tokens[i].size())); + std::cout << " Seq " << i << ": " << all_tokens[i].size() << " tokens - \"" + << sentences[i].substr(0, 40) << "...\"" << std::endl; + } + + // ========================================== + // 4. Initialize Paged KV Cache + // ========================================== + const int block_size = 16; + + size_t free_mem, total_mem; + get_gpu_memory_info(&free_mem, &total_mem); + + // Calculate available blocks + size_t one_block_bytes = (size_t)block_size * num_layers * 2 * kv_dim * sizeof(float); + size_t reserved_memory = 512UL * 1024 * 1024; // 512MB for activations + size_t kv_cache_pool_size = (size_t)(free_mem * 0.9) - reserved_memory; + int total_blocks = kv_cache_pool_size / one_block_bytes; + int max_blocks_per_seq = (seq_len + block_size - 1) / block_size; + + std::cout << "\n>>> [Memory Plan]" << std::endl; + std::cout << " KV Cache Pool : " << kv_cache_pool_size / 1024 / 1024 << " MB" << std::endl; + std::cout << " Total Blocks : " << total_blocks << std::endl; + std::cout << " Max Context : " << total_blocks * block_size << " Tokens" << std::endl; + + auto kv_init_status = model.init_paged_kv_cache(actual_num_seqs, block_size, total_blocks, max_blocks_per_seq); + if (!kv_init_status) { + std::cerr << "Error: Failed to initialize paged KV cache" << std::endl; + return -1; + } + + std::cout << ">>> [Init] Paged KV Cache initialized for " << actual_num_seqs << " sequences" << std::endl; + + // ========================================== + // 5. Chunked Prefill for all sequences + // ========================================== + std::cout << "\n>>> [Inference] Starting chunked prefill for all sequences..." 
<< std::endl; + + auto* kv_manager = model.get_kv_cache_manager(); + std::vector next_tokens(actual_num_seqs, -1); + std::vector prompt_lens(actual_num_seqs); + + auto prefill_start = std::chrono::steady_clock::now(); + int32_t total_prefill_tokens = 0; + + for (int32_t seq_id = 0; seq_id < actual_num_seqs; ++seq_id) { + const auto& tokens = all_tokens[seq_id]; + prompt_lens[seq_id] = static_cast(tokens.size()); + total_prefill_tokens += prompt_lens[seq_id]; + + // Get embeddings + const auto& prompt_embedding = model.embedding(tokens); + auto [input_tokens, input_embeddings, input_token_num] = prompt_embedding; + + // Compute chunk size if auto + int32_t seq_chunk_size = chunk_size; + if (seq_chunk_size <= 0) { + seq_chunk_size = model.compute_chunk_size(prompt_lens[seq_id], actual_num_seqs); + } + + // Allocate blocks for the entire prompt + if (!kv_manager->allocate_blocks_for_tokens(seq_id, prompt_lens[seq_id])) { + std::cerr << "Error: Failed to allocate KV cache blocks for seq " << seq_id << std::endl; + return -1; + } + + // Chunked prefill + auto prefill_status = model.forward_chunked_prefill( + seq_id, + input_embeddings, + 0, // start_pos + seq_chunk_size, + next_tokens[seq_id]); + + if (!prefill_status) { + std::cerr << "Error: Chunked prefill failed for seq " << seq_id << ": " + << prefill_status.get_err_msg() << std::endl; + return -1; + } + } + + auto prefill_end = std::chrono::steady_clock::now(); + double prefill_ms = std::chrono::duration_cast>( + prefill_end - prefill_start).count(); + + std::cout << " [Prefill] Processed " << total_prefill_tokens << " tokens across " + << actual_num_seqs << " sequences in " << prefill_ms << " ms" << std::endl; + std::cout << " [Prefill] Throughput: " << (total_prefill_tokens * 1000.0 / prefill_ms) + << " tokens/sec" << std::endl; + + // ========================================== + // 6. Batch Decode phase + // ========================================== + std::cout << "\n>>> [Decode] Generating tokens (batch decode)..." 
<< std::endl; + + std::vector> generated_tokens(actual_num_seqs); + std::vector positions(actual_num_seqs); + std::vector finished(actual_num_seqs, false); + + for (int32_t i = 0; i < actual_num_seqs; ++i) { + positions[i] = prompt_lens[i]; + } + + int max_new_tokens = 50; + int total_generated = 0; + + auto decode_start = std::chrono::steady_clock::now(); + + for (int step = 0; step < max_new_tokens; ++step) { + // Collect active sequences + std::vector active_seq_ids; + std::vector active_positions; + std::vector active_tokens; + + for (int32_t i = 0; i < actual_num_seqs; ++i) { + if (!finished[i]) { + // Check for end of sentence + if (model.is_sentence_ending(next_tokens[i])) { + finished[i] = true; + continue; + } + + generated_tokens[i].push_back(next_tokens[i]); + active_seq_ids.push_back(i); + active_positions.push_back(positions[i]); + active_tokens.push_back(next_tokens[i]); + } + } + + if (active_seq_ids.empty()) { + std::cout << " [Decode] All sequences finished" << std::endl; + break; + } + + // Get embeddings for all active tokens + const auto& token_embedding = model.embedding(active_tokens); + auto [tok_ids, tok_emb, tok_num] = token_embedding; + + // Allocate blocks for new tokens + for (size_t i = 0; i < active_seq_ids.size(); ++i) { + int32_t seq_id = active_seq_ids[i]; + kv_manager->allocate_blocks_for_tokens(seq_id, positions[seq_id] + 1); + } + + // Batch decode + std::vector batch_next_tokens; + auto decode_status = model.forward_decode(active_seq_ids, active_positions, tok_emb, batch_next_tokens); + if (!decode_status) { + std::cerr << "Error: Batch decode failed: " << decode_status.get_err_msg() << std::endl; + break; + } + + // Update next tokens and positions + for (size_t i = 0; i < active_seq_ids.size(); ++i) { + int32_t seq_id = active_seq_ids[i]; + if (i < batch_next_tokens.size()) { + next_tokens[seq_id] = batch_next_tokens[i]; + } + kv_manager->set_context_len(seq_id, positions[seq_id] + 1); + positions[seq_id] += 1; + total_generated++; + } + } + + auto decode_end = std::chrono::steady_clock::now(); + double decode_ms = std::chrono::duration_cast>( + decode_end - decode_start).count(); + + // ========================================== + // 7. Print generated text for each sequence + // ========================================== + std::cout << "\n>>> [Generated Text]" << std::endl; + for (int32_t i = 0; i < actual_num_seqs; ++i) { + std::string generated_text = model.decode(generated_tokens[i]); + std::cout << " Seq " << i << " (" << generated_tokens[i].size() << " tokens): " + << sentences[i].substr(0, 30) << "... " << generated_text << std::endl; + } + + // ========================================== + // 8. Statistics + // ========================================== + std::cout << "\n>>> [Statistics]" << std::endl; + std::cout << " Num sequences : " << actual_num_seqs << std::endl; + std::cout << " Total prompt : " << total_prefill_tokens << " tokens" << std::endl; + std::cout << " Total generated : " << total_generated << " tokens" << std::endl; + std::cout << " Decode time : " << decode_ms << " ms" << std::endl; + if (total_generated > 0) { + std::cout << " Decode throughput: " << (total_generated * 1000.0 / decode_ms) + << " tokens/sec" << std::endl; + } + + std::cout << "\n>>> [Success] Multi-sequence unified forward inference completed!" 
<< std::endl; + + return 0; +} \ No newline at end of file diff --git a/demo/mainpaged.cpp b/demo/mainpaged.cpp new file mode 100644 index 0000000..7703979 --- /dev/null +++ b/demo/mainpaged.cpp @@ -0,0 +1,269 @@ +#include +#include +#include +#include +#include +#include +#include "base/tick.h" +#include "base/block_allocator.h" +#include "base/kv_cache_manager.h" +#include "model/llama3.h" +#include "model/config.h" + +void get_gpu_memory_info(size_t* free, size_t* total) { + cudaError_t status = cudaMemGetInfo(free, total); + if (status != cudaSuccess) { + std::cerr << "Error: cudaMemGetInfo failed with " << cudaGetErrorString(status) << std::endl; + return; + } + std::cout << ">>> [GPU] Free Memory: " << *free / (1024 * 1024) << " MB" << std::endl; + std::cout << ">>> [GPU] Total Memory: " << *total / (1024 * 1024) << " MB" << std::endl; +} + +int main(int argc, char* argv[]) { + if (argc != 3) { + std::cout << "Usage: ./llama_infer_paged " << std::endl; + return -1; + } + const char* checkpoint_path = argv[1]; + const char* tokenizer_path = argv[2]; + + // ========================================== + // 1. 初始化模型 + // ========================================== + std::cout << ">>> [Init] Loading model from " << checkpoint_path << std::endl; + + // 创建模型实例 + model::LLama2Model model(base::TokenizerType::kEncodeSpe, tokenizer_path, + checkpoint_path, false); + + auto init_status = model.init(base::DeviceType::kDeviceCUDA); + if (!init_status) { + std::cerr << "Model init failed: " << init_status.get_err_code() << std::endl; + return -1; + } + + std::cout << ">>> [Init] Model loaded successfully" << std::endl; + + // ========================================== + // 2. 从模型配置获取参数 + // ========================================== + const auto* config = model.config(); + if (!config) { + std::cerr << "Error: Failed to get model config" << std::endl; + return -1; + } + + const int32_t head_dim = config->head_size_; + const int32_t num_kv_heads = config->kv_head_num_; + const int32_t num_heads = config->head_num_; + const int32_t num_layers = config->layer_num_; + const int32_t dim = config->dim_; + const int32_t hidden_dim = config->hidden_dim_; + const int32_t seq_len = config->seq_len_; + + std::cout << "\n>>> [Model Config]" << std::endl; + std::cout << " dim : " << dim << std::endl; + std::cout << " hidden_dim : " << hidden_dim << std::endl; + std::cout << " num_layers : " << num_layers << std::endl; + std::cout << " num_heads : " << num_heads << std::endl; + std::cout << " num_kv_heads : " << num_kv_heads << std::endl; + std::cout << " head_dim : " << head_dim << std::endl; + std::cout << " seq_len : " << seq_len << std::endl; + + // ========================================== + // 3. 准备输入句子并编码 + // ========================================== + const std::string sentence = "Once upon a time, in a land far away,"; + std::cout << "\n>>> [Input] Encoding sentence: \"" << sentence << "\"" << std::endl; + + auto tokens = model.encode(sentence); + if (tokens.empty()) { + std::cerr << "Error: Failed to encode sentence" << std::endl; + return -1; + } + + std::cout << " [Encode] Token count: " << tokens.size() << std::endl; + + // ========================================== + // 4. GPU 内存规划和 Paged KV Cache 初始化 + // ========================================== + std::cout << "\n>>> [Memory Planning] Calculating KV cache allocation..." 
<< std::endl; + + const int block_size = 16; // 每个物理块包含的 token 数量 + + // 4.1 预留显存 (Activation + Overhead) + const int max_seq_len = seq_len; // 使用模型配置的 seq_len + + // Activation memory estimation (Prefill 阶段峰值) + const int32_t kv_dim = num_kv_heads * head_dim; + size_t activation_peak = 0; + + // 1. Hidden states + activation_peak += (size_t)max_seq_len * dim * sizeof(float); + + // 2. Q/K/V 投影中间张量 + activation_peak += (size_t)max_seq_len * dim * sizeof(float); // Q + activation_peak += (size_t)max_seq_len * kv_dim * sizeof(float); // K + activation_peak += (size_t)max_seq_len * kv_dim * sizeof(float); // V + + // 3. Attention scores (O(n²) - Prefill 阶段最大的内存消耗) + activation_peak += (size_t)num_heads * max_seq_len * max_seq_len * sizeof(float); + + // 4. FFN 中间层 (SwiGLU 有 gate 和 up 两个分支) + activation_peak += (size_t)2 * max_seq_len * hidden_dim * sizeof(float); + + // 5. 固定开销 (cuBLAS workspace, 临时 buffer 等) + const size_t fixed_overhead = 256UL * 1024 * 1024; // 256MB + const size_t reserved_memory = activation_peak + fixed_overhead; + + std::cout << " [Profile] Estimated Activation Reserve: " << reserved_memory / 1024 / 1024 << " MB" << std::endl; + + // 4.2 计算实际可用显存 + float gpu_memory_utilization = 0.90; + + size_t free_mem, total_mem; + get_gpu_memory_info(&free_mem, &total_mem); + + size_t available_mem = (size_t)(free_mem * gpu_memory_utilization); + + if (available_mem <= reserved_memory) { + std::cerr << "Error: Not enough memory! Available: " << available_mem / 1024 / 1024 + << "MB, Required: " << reserved_memory / 1024 / 1024 << "MB" << std::endl; + return -1; + } + + size_t kv_cache_pool_size = available_mem - reserved_memory; + + // 计算单个 Block 的物理大小 + // Layout: [max_blocks, num_layers, 2 (K+V), block_size, kv_dim] + size_t one_block_bytes = (size_t)block_size * num_layers * 2 * kv_dim * sizeof(float); + + // 计算 Block 数量 + int total_blocks = kv_cache_pool_size / one_block_bytes; + + std::cout << "\n>>> [Memory Plan]" << "\n" + << " Total GPU Mem : " << total_mem / 1024 / 1024 << " MB\n" + << " Free GPU Mem : " << free_mem / 1024 / 1024 << " MB\n" + << " Reserved Mem : " << reserved_memory / 1024 / 1024 << " MB\n" + << " KV Cache Pool : " << kv_cache_pool_size / 1024 / 1024 << " MB\n" + << " Block Size : " << one_block_bytes / 1024 << " KB\n" + << " >>> Total Blocks: " << total_blocks << "\n" + << " >>> Max Context : " << total_blocks * block_size << " Tokens\n" << std::endl; + + if (total_blocks <= 0) { + std::cerr << "Error: No memory left for KV Cache!" << std::endl; + return -1; + } + + // ========================================== + // 5. 初始化 Paged KV Cache + // ========================================== + const int num_seqs = 1; // 当前只支持单个序列 + int max_blocks_per_seq = (max_seq_len + block_size - 1) / block_size; + + std::cout << ">>> [Init] Initializing Paged KV Cache..." << std::endl; + std::cout << " num_seqs : " << num_seqs << std::endl; + std::cout << " block_size : " << block_size << std::endl; + std::cout << " total_blocks : " << total_blocks << std::endl; + std::cout << " max_blocks_per_seq: " << max_blocks_per_seq << std::endl; + + auto kv_init_status = model.init_paged_kv_cache(num_seqs, block_size, total_blocks, max_blocks_per_seq); + if (!kv_init_status) { + std::cerr << "Error: Failed to initialize paged KV cache: " << kv_init_status.get_err_code() << std::endl; + return -1; + } + + std::cout << ">>> [Init] Paged KV Cache initialized successfully" << std::endl; + + // ========================================== + // 6. 
开始推理:Prefill 阶段 + // ========================================== + std::cout << "\n>>> [Inference] Starting generation..." << std::endl; + + int32_t prompt_len = static_cast(tokens.size()); + int32_t pos = 0; + int32_t next = -1; + bool is_prompt = true; + const int total_steps = 128; // 最大生成步数 + + // 获取 prompt embedding + const auto& prompt_embedding = model.embedding(tokens); + tensor::Tensor pos_tensor = model.get_buffer(model::ModelBufferType::kInputPos); + + std::vector generated_tokens; + + // Prefill 阶段:处理 prompt tokens + while (pos < prompt_len - 1) { + pos_tensor.index(0) = pos; + + tensor::Tensor input = model.fill_input(pos_tensor, prompt_embedding, is_prompt); + model.predict(input, pos_tensor, is_prompt, next); + + next = tokens.at(pos + 1); + generated_tokens.push_back(next); + pos += 1; + } + + std::cout << " [Prefill] Processed " << pos << " prompt tokens" << std::endl; + + // ========================================== + // 7. Decode 阶段:逐个生成新 token + // ========================================== + is_prompt = false; + int max_new_tokens = 50; // 最多生成 50 个新 token + + std::cout << "\n>>> [Decode] Generating tokens..." << std::endl; + std::cout << "Generated: "; + fflush(stdout); + + // 用于增量解码:记录上次解码的字符串长度 + std::string prev_decoded = ""; + + for (int step = 0; step < max_new_tokens && pos < total_steps; ++step) { + // 更新位置 + pos_tensor.index(0) = pos; + + // 使用上一步生成的 token 作为输入 + std::vector current_token = {next}; + const auto& token_embedding = model.embedding(current_token); + tensor::Tensor input = model.fill_input(pos_tensor, token_embedding, is_prompt); + + // 执行推理(KVCacheManager 会自动管理块分配) + model.predict(input, pos_tensor, is_prompt, next); + + // 检查是否结束 + if (model.is_sentence_ending(next)) { + std::cout << std::endl; + std::cout << " [Decode] Reached end of sentence token" << std::endl; + break; + } + + // 添加生成的 token + generated_tokens.push_back(next); + + // 增量解码:解码所有已生成的 token,只输出新增部分 + std::string full_decoded = model.decode(generated_tokens); + if (full_decoded.length() > prev_decoded.length()) { + std::cout << full_decoded.substr(prev_decoded.length()); + fflush(stdout); + } + prev_decoded = full_decoded; + + pos += 1; + } + + std::cout << std::endl; + + // ========================================== + // 8. 输出统计信息 + // ========================================== + std::cout << "\n>>> [Statistics]" << std::endl; + std::cout << " Prompt tokens : " << prompt_len << std::endl; + std::cout << " Generated tokens : " << generated_tokens.size() << std::endl; + std::cout << " Total tokens : " << pos + 1 << std::endl; + + std::cout << "\n>>> [Success] Paged attention inference completed!" 
<< std::endl; + + return 0; +} \ No newline at end of file diff --git a/kuiper/include/base/base.h b/kuiper/include/base/base.h index 53ab5d8..a112f48 100644 --- a/kuiper/include/base/base.h +++ b/kuiper/include/base/base.h @@ -26,9 +26,36 @@ enum class ModelBufferType { kFFNRMSNorm = 13, kForwardOutput = 15, kForwardOutputCPU = 16, - kSinCache = 17, kCosCache = 18, + kPagedKVCache = 19, + kPagedBlockTables = 20, + kPagedAttentionContextLens = 21, + // Batch buffers for multi-sequence processing + kBatchQuery = 22, // [num_seqs, dim] + kBatchKey = 23, // [num_seqs, kv_dim] + kBatchValue = 24, // [num_seqs, kv_dim] + kBatchOutputMHA = 25, // [num_seqs, dim] + kBatchAttnOutput = 26, // [num_seqs, dim] + kBatchRMSNormOutput = 27, // [num_seqs, dim] + kBatchW1Output = 28, // [num_seqs, hidden_dim] + kBatchW3Output = 29, // [num_seqs, hidden_dim] + kBatchW2Output = 30, // [num_seqs, dim] + kBatchFFNRMSNorm = 31, // [num_seqs, dim] + kBatchInputEmbeddings = 32, // [num_seqs, dim] + kBatchPositions = 33, // [num_seqs] - positions for each sequence + kBatchForwardOutput = 34, // [num_seqs, vocab_size] - logits for each sequence + // Unified forward buffers (for chunked prefill / continuous batching) + kUnifiedQuery = 35, // [num_tokens, dim] + kUnifiedKey = 36, // [num_tokens, kv_dim] + kUnifiedValue = 37, // [num_tokens, kv_dim] + kUnifiedRMSNormOutput = 38, // [num_tokens, dim] + kUnifiedMHAOutput = 39, // [num_tokens, dim] + kUnifiedAttnOutput = 40, // [num_tokens, dim] + kUnifiedFFNRMSNorm = 41, // [num_tokens, dim] + kUnifiedW1Output = 42, // [num_tokens, hidden_dim] + kUnifiedW3Output = 43, // [num_tokens, hidden_dim] + kUnifiedW2Output = 44, // [num_tokens, dim] }; } diff --git a/kuiper/include/base/batch_metadata.h b/kuiper/include/base/batch_metadata.h new file mode 100644 index 0000000..c93c08c --- /dev/null +++ b/kuiper/include/base/batch_metadata.h @@ -0,0 +1,75 @@ +#ifndef KUIPER_INCLUDE_BASE_BATCH_METADATA_H_ +#define KUIPER_INCLUDE_BASE_BATCH_METADATA_H_ + +#include +#include +#include "base/alloc.h" +#include "tensor/tensor.h" + +namespace base { + +// Unified batch metadata for forward pass +// Supports: single decode, batch decode, chunked prefill, mixed prefill+decode +struct BatchMetadata { + int32_t num_tokens = 0; // Total number of tokens in this batch + int32_t num_seqs = 0; // Number of sequences + + // GPU tensors + tensor::Tensor seq_ids; // [num_tokens] - sequence index for each token + tensor::Tensor positions; // [num_tokens] - position for each token + tensor::Tensor seq_start_locs; // [num_seqs + 1] - start location of each sequence in tokens + + // CPU copies for host-side logic + std::vector seq_ids_cpu; + std::vector positions_cpu; + std::vector seq_start_locs_cpu; + std::vector seq_lens_cpu; // Current context length for each sequence + std::vector is_prefill_cpu; // Whether each sequence is in prefill phase + + // Query methods + bool has_prefill() const; + bool has_decode() const; + int32_t max_prefill_len() const; + int32_t max_context_len() const; + int32_t get_num_prefill_tokens() const; + int32_t get_num_decode_tokens() const; + + // Get the last token index for each sequence (for logits computation) + std::vector get_last_token_indices() const; + + // Factory methods + static BatchMetadata create_decode( + const std::vector& seq_ids, + const std::vector& positions, + const std::vector& context_lens, + std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream = nullptr); + + static BatchMetadata create_prefill( + int32_t seq_id, + int32_t 
start_pos, + int32_t num_tokens, + int32_t context_len, + std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream = nullptr); + + static BatchMetadata create_mixed( + const std::vector& seq_ids, + const std::vector& token_counts, // Number of tokens per sequence + const std::vector& start_positions, + const std::vector& context_lens, + const std::vector& is_prefill, + std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream = nullptr); + + private: + void sync_to_device(std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream); +}; + +} // namespace base +#endif // KUIPER_INCLUDE_BASE_BATCH_METADATA_H_ \ No newline at end of file diff --git a/kuiper/include/base/block_allocator.h b/kuiper/include/base/block_allocator.h new file mode 100644 index 0000000..66b1331 --- /dev/null +++ b/kuiper/include/base/block_allocator.h @@ -0,0 +1,151 @@ +/** + * @author jintang-coder + * - BlockAllocator: Physical memory block pool management + * + * ============================================================================= + * BlockAllocator 物理块分配器 + * ============================================================================= + * + * 功能: 管理GPU显存中的物理块池,类似操作系统的页帧分配器 + * + * 核心数据结构: + * class BlockAllocator { + * int total_blocks_; // 总块数 + * int block_size_; // 每块可存储的token数 + * std::vector free_blocks_; // 空闲块ID栈 + * }; + * + * --- + * 具体例子: 初始化与分配过程 + * + * 假设配置: + * total_blocks = 100 + * block_size = 16 (每块存16个token的KV) + * + * 初始化后: + * free_blocks_ = [99, 98, 97, ..., 2, 1, 0] // 栈结构,后进先出 + * + * --- + * 分配场景1: Prefill阶段 - 一次分配多块 + * + * // 序列有50个token,需要 ceil(50/16) = 4 个blocks + * std::vector blocks = allocator.allocate(4); + * // 返回 blocks = [0, 1, 2, 3] + * // free_blocks_ = [99, 98, ..., 5, 4] + * + * --- + * 分配场景2: Decode阶段 - 单块分配 + * + * // 当前token位置超出已分配块,需要新块 + * int new_block = allocator.allocate(); + * // 返回 new_block = 4 + * // free_blocks_ = [99, 98, ..., 5] + * + * --- + * 释放场景: 序列完成后归还 + * + * allocator.free({0, 1, 2, 3, 4}); + * // free_blocks_ = [99, 98, ..., 5, 0, 1, 2, 3, 4] + * // 注意: 归还顺序不影响,下次分配从栈顶取 + * + * --- + * 与上层组件的关系: + * + * KVCacheManager + * │ + * │ allocate_blocks_for_tokens(seq_idx, num_tokens) + * │ └─> 计算需要多少blocks + * │ └─> 调用 block_alloc_->allocate() + * ▼ + * BlockAllocator + * │ + * │ allocate() / free() + * ▼ + * free_blocks_ 栈 + * + * --- + * 内存布局示意: + * + * GPU显存 KV Cache Pool: + * ┌─────────┬─────────┬─────────┬─────────┬─────────┬─────────┐ + * │ Block 0 │ Block 1 │ Block 2 │ Block 3 │ Block 4 │ ... 
│ + * │ (seq0) │ (seq0) │ (seq1) │ (seq1) │ (free) │ │ + * └─────────┴─────────┴─────────┴─────────┴─────────┴─────────┘ + * + * 每个Block内部: + * [num_layers][2(K/V)][block_size][kv_dim] + * + */ + +#ifndef KUIPER_BASE_BLOCK_ALLOCATOR_H +#define KUIPER_BASE_BLOCK_ALLOCATOR_H + +#include +#include +#include + + +namespace base { + +// 显存物理逻辑块管理器 +class BlockAllocator { +public: + // 初始化:构建空闲块链表(栈) + explicit BlockAllocator(int total_blocks, int block_size) + : total_blocks_(total_blocks), block_size_(block_size) { + free_blocks_.reserve(total_blocks); + for (int i = 0; i < total_blocks; ++i) { + // 初始时所有块都是空闲的 + free_blocks_.push_back(total_blocks - 1 - i); + } + } + + // 分配 n 个块,成功返回 block_ids,失败返回空 + std::vector allocate(int num_blocks) { + std::lock_guard lock(mutex_); + if (free_blocks_.size() < num_blocks) { + return {}; // OOM (Out of Memory) + } + std::vector allocated_blocks; + for (int i = 0; i < num_blocks; ++i) { + allocated_blocks.push_back(free_blocks_.back()); + free_blocks_.pop_back(); + } + return allocated_blocks; + } + + // 分配单个块(用于 Decode 阶段) + int allocate() { + std::lock_guard lock(mutex_); + if (free_blocks_.empty()) { + return -1; // OOM + } + int block_id = free_blocks_.back(); + free_blocks_.pop_back(); + return block_id; + } + + // 释放块,归还到池中 + void free(const std::vector& blocks) { + std::lock_guard lock(mutex_); + for (int block_id : blocks) { + free_blocks_.push_back(block_id); + } + } + + int get_free_block_num() const { + return free_blocks_.size(); + } + + int block_size() const { return block_size_; } + +private: + int total_blocks_; + int block_size_; + std::vector free_blocks_; // 空闲块 ID 栈 + std::mutex mutex_; +}; + +} // namespace base + +#endif // KUIPER_BASE_BLOCK_ALLOCATOR_H \ No newline at end of file diff --git a/kuiper/include/base/kv_cache_manager.h b/kuiper/include/base/kv_cache_manager.h new file mode 100644 index 0000000..353a573 --- /dev/null +++ b/kuiper/include/base/kv_cache_manager.h @@ -0,0 +1,114 @@ +#ifndef KUIPER_BASE_KV_CACHE_MANAGER_H +#define KUIPER_BASE_KV_CACHE_MANAGER_H + +#include +#include +#include +#include "base/alloc.h" +#include "base/block_allocator.h" +#include "base/buffer.h" +#include "tensor/tensor.h" + +namespace base { + +class KVCacheManager { +public: + base::Status init_paged_token_major( + base::DeviceType device_type, + std::shared_ptr alloc_dev, + std::shared_ptr alloc_cpu, + int32_t num_seqs, + int32_t num_layers, + int32_t num_kv_heads, + int32_t head_dim, + int32_t block_size, + int32_t max_blocks, + int32_t max_blocks_per_seq, + void* stream = nullptr); + + std::pair slice_kv( + int32_t seq_idx, int32_t layer_idx, int32_t token_pos); + + const tensor::Tensor& kv_cache_device() const; + const tensor::Tensor& block_tables_device(); // 确保已同步到 device + const tensor::Tensor& context_lens_device(); // 确保已同步到 device + + const tensor::Tensor& block_tables_host() const; + const tensor::Tensor& context_lens_host() const; + + void sync_tables_to_device(void* stream = nullptr); // host->device + + void reset_sequence(int32_t seq_idx); // 释放该 seq 的所有 blocks,block_tables=-1, context_len=0 + void reset_all(); + + // Getters for configuration + int32_t block_size() const; + int32_t max_blocks_per_seq() const; + int32_t kv_dim() const; + int32_t num_layers() const; + int32_t num_kv_heads() const; + int32_t head_dim() const; + int32_t num_seqs() const; + int32_t max_blocks() const; + + // Get context length for a sequence + int32_t get_context_len(int32_t seq_idx) const; + + // Set context length directly (useful after prefill) 
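// A minimal usage sketch (mirrors demo/main_unified.cpp; `kv_mgr`, `slot`,
// `prompt_len` and `step` are placeholder names, not part of this header):
//   kv_mgr->allocate_blocks_for_tokens(slot, prompt_len);              // prefill: reserve blocks for the whole prompt
//   /* ... run prefill forward ... */
//   kv_mgr->set_context_len(slot, prompt_len);                         // record the prefilled length
//   kv_mgr->allocate_blocks_for_tokens(slot, prompt_len + step + 1);   // decode: grow by one token per step
//   kv_mgr->set_context_len(slot, prompt_len + step + 1);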
+ void set_context_len(int32_t seq_idx, int32_t context_len); + + // Allocate blocks for multiple tokens at once (for prefill) + // Returns true if allocation succeeded + bool allocate_blocks_for_tokens(int32_t seq_idx, int32_t num_tokens); + + // Sequence slot management (for Scheduler) + // Allocate a free sequence slot, returns slot index or -1 if none available + int32_t allocate_sequence_slot(); + + // Release a sequence slot (frees blocks and resets state) + void release_sequence_slot(int32_t seq_idx); + + // Check if there are free sequence slots + bool has_free_slot() const; + + // Get number of free sequence slots + int32_t num_free_slots() const; + + // Get number of free blocks (for memory pressure check) + int32_t num_free_blocks() const; + +private: + float* key_ptr_(int32_t physical_block_id, int32_t layer_idx, int32_t token_in_block); + float* value_ptr_(int32_t physical_block_id, int32_t layer_idx, int32_t token_in_block); + + int32_t ensure_physical_block_(int32_t seq_idx, int32_t logical_block); + + // Internal sync without locking (caller must hold mu_) + void sync_tables_to_device_unlocked_(void* stream); + +private: + mutable std::mutex mu_; + base::DeviceType device_type_{base::DeviceType::kDeviceUnknown}; + + int32_t num_seqs_{0}, num_layers_{0}, num_kv_heads_{0}, head_dim_{0}; + int32_t kv_dim_{0}; + int32_t block_size_{0}, max_blocks_{0}, max_blocks_per_seq_{0}; + + std::shared_ptr alloc_dev_; + std::shared_ptr alloc_cpu_; + + tensor::Tensor kv_cache_dev_; // float, device + tensor::Tensor block_tables_host_; // int32, cpu + tensor::Tensor context_lens_host_; // int32, cpu + tensor::Tensor block_tables_dev_; // int32, device mirror + tensor::Tensor context_lens_dev_; // int32, device mirror + + std::unique_ptr block_alloc_; + bool tables_dirty_{true}; + + // Sequence slot management + std::vector free_seq_slots_; // Available sequence slots +}; + +} // namespace base +#endif // KUIPER_BASE_KV_CACHE_MANAGER_H diff --git a/kuiper/include/base/scheduler.h b/kuiper/include/base/scheduler.h new file mode 100644 index 0000000..e3df799 --- /dev/null +++ b/kuiper/include/base/scheduler.h @@ -0,0 +1,124 @@ +#ifndef KUIPER_BASE_SCHEDULER_H +#define KUIPER_BASE_SCHEDULER_H + +#include +#include +#include +#include +#include +#include "base/scheduler_config.h" +#include "base/sequence.h" + +namespace base { + +class KVCacheManager; + +class Scheduler { +public: + explicit Scheduler(const SchedulerConfig& config); + ~Scheduler() = default; + + // Set the KV cache manager (required before scheduling) + void set_kv_cache_manager(KVCacheManager* kv_manager); + + // Add a new request to the waiting queue + // Returns the assigned sequence ID + int32_t add_request(const std::vector& prompt_tokens, + int32_t max_new_tokens = 128, + const std::string& original_text = ""); + + // Main scheduling function - called each iteration + // Returns which sequences to process and how many tokens each + SchedulerOutput schedule(); + + // Update sequence state after forward pass + // next_tokens: one token per scheduled sequence (in order) + void update_after_forward(const SchedulerOutput& output, + const std::vector& next_tokens); + + // Mark a sequence as finished (EOS or external termination) + void finish_sequence(int32_t seq_id); + + // Get sequence by ID (for reading output tokens, etc.) 
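// Minimal driver sketch (condensed from demo/main_continuous.cpp; the model
// call is elided and `next_tokens` is assumed to hold one token per scheduled
// sequence, in schedule order):
//   base::Scheduler scheduler(config);
//   scheduler.set_kv_cache_manager(kv_manager);
//   scheduler.add_request(prompt_tokens, /*max_new_tokens=*/50);
//   while (!scheduler.all_finished()) {
//     auto out = scheduler.schedule();
//     if (!out.has_work()) break;
//     std::vector<int32_t> next_tokens;
//     /* ... build BatchMetadata from out.scheduled_seqs and run the model ... */
//     scheduler.update_after_forward(out, next_tokens);
//   }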
+ const Sequence* get_sequence(int32_t seq_id) const; + + // Get mutable sequence by ID + Sequence* get_sequence_mut(int32_t seq_id); + + // Check if all sequences are finished (no waiting, no running) + bool all_finished() const; + + // Check if there's any work to do + bool has_pending_work() const; + + // Get statistics + int32_t num_waiting() const; + int32_t num_running() const; + int32_t num_preempted() const; + int32_t num_finished() const; + + // Get all finished sequences + std::vector get_finished_sequences() const; + + // Get all sequences (for final output) + std::vector get_all_sequences() const; + + // Get config + const SchedulerConfig& config() const { return config_; } + +private: + // Check if we can admit a new sequence from waiting queue + bool can_admit_new_sequence() const; + + // Estimate blocks needed for a sequence + int32_t estimate_blocks_needed(const Sequence& seq, int32_t additional_tokens) const; + + // Schedule decode sequences (priority) + void schedule_decode(SchedulerOutput& output, int32_t& remaining_tokens, int32_t& remaining_seqs); + + // Schedule prefill sequences (or chunks) + void schedule_prefill(SchedulerOutput& output, int32_t& remaining_tokens, int32_t& remaining_seqs); + + // Try to admit new sequences from waiting queue + void try_admit_waiting(SchedulerOutput& output, int32_t& remaining_tokens, int32_t& remaining_seqs); + + // Try to re-admit preempted sequences (priority over waiting) + void try_admit_preempted(SchedulerOutput& output, int32_t& remaining_tokens, int32_t& remaining_seqs); + + // Move sequence from waiting to running + bool admit_sequence(Sequence& seq); + + // Add a sequence to the scheduled output + void add_to_schedule(SchedulerOutput& output, Sequence& seq, + int32_t num_tokens, bool is_prefill); + + // Preemption: evict sequences when memory is tight + // Returns number of sequences evicted + int32_t preempt_sequences(int32_t blocks_needed); + + // Select victim sequence for preemption (lowest priority = least generated tokens) + int32_t select_victim_sequence() const; + + // Check if memory pressure requires preemption + bool need_preemption(int32_t blocks_needed) const; + +private: + SchedulerConfig config_; + KVCacheManager* kv_manager_ = nullptr; + + // Sequence storage + std::unordered_map sequences_; + int32_t next_seq_id_ = 0; + + // Queues (store seq_ids, not Sequence objects) + std::deque waiting_queue_; // FIFO for new requests + std::deque running_queue_; // Active sequences + std::deque preempted_queue_; // Preempted sequences (priority over waiting) + std::vector finished_seqs_; // Completed sequences + + mutable std::mutex mutex_; +}; + +} // namespace base + +#endif // KUIPER_BASE_SCHEDULER_H \ No newline at end of file diff --git a/kuiper/include/base/scheduler_config.h b/kuiper/include/base/scheduler_config.h new file mode 100644 index 0000000..631c8e3 --- /dev/null +++ b/kuiper/include/base/scheduler_config.h @@ -0,0 +1,36 @@ +#ifndef KUIPER_BASE_SCHEDULER_CONFIG_H +#define KUIPER_BASE_SCHEDULER_CONFIG_H + +#include + +namespace base { + +struct SchedulerConfig { + // Maximum number of tokens to process in a single iteration + // This limits GPU memory usage for activations + int32_t max_num_batched_tokens = 512; + + // Maximum number of sequences in a single batch + int32_t max_num_seqs = 256; + + // Maximum sequence length (context + generated) + int32_t max_model_len = 4096; + + // Maximum tokens for a single prefill chunk + // Smaller chunks allow decode requests to interleave + int32_t 
max_prefill_tokens = 128; + + // Minimum free blocks to keep as buffer + // Prevents OOM during decode phase + int32_t min_free_blocks = 8; + + // Block size for KV cache (must match KVCacheManager) + int32_t block_size = 16; + + // Policy: true mean decode first , false = FCFS + bool decode_priority = true; +}; + +} + +#endif // KUIPER_BASE_SCHEDULER_CONFIG_H diff --git a/kuiper/include/base/sequence.h b/kuiper/include/base/sequence.h new file mode 100644 index 0000000..9acf179 --- /dev/null +++ b/kuiper/include/base/sequence.h @@ -0,0 +1,116 @@ +#ifndef KUIPER_BASE_SEQUENCE_H +#define KUIPER_BASE_SEQUENCE_H + +#include +#include +#include +#include + +namespace base { + +// Sequence lifecycle states +enum class SequenceStatus : uint8_t { + WAITING = 0, // In waiting queue, not started + RUNNING = 1, // Being processed (has KV cache allocated) + PREEMPTED = 2, // Evicted due to memory pressure, can be rescheduled + FINISHED = 3, // Generation complete (EOS or max length) +}; + +// Represents a single inference request/sequence +struct Sequence { + // Unique identifier for this sequence + int32_t seq_id = -1; + + // Slot index in KVCacheManager (may differ from seq_id) + int32_t kv_slot = -1; + + // Current status + SequenceStatus status = SequenceStatus::WAITING; + + // Input prompt tokens + std::vector prompt_tokens; + + // Generated output tokens + std::vector output_tokens; + + // Current position in prefill (0 = not started, prompt_len = prefill done) + int32_t prefill_pos = 0; + + // Next token to feed (for decode phase) + int32_t next_token = -1; + + // Request metadata + int32_t max_new_tokens = 128; + std::string original_text; + + // Preemption tracking + int32_t num_preemptions = 0; // How many times this sequence was preempted + + // Timing (optional, for metrics) + std::chrono::steady_clock::time_point arrival_time; + std::chrono::steady_clock::time_point first_token_time; + + // Total context length = prefill_pos + output_tokens.size() + int32_t context_len() const { + return prefill_pos + static_cast(output_tokens.size()); + } + + // Check if still in prefill phase + bool is_prefill() const { + return prefill_pos < static_cast(prompt_tokens.size()); + } + + // Remaining prefill tokens + int32_t remaining_prefill() const { + return static_cast(prompt_tokens.size()) - prefill_pos; + } + + // Total prompt length + int32_t prompt_len() const { + return static_cast(prompt_tokens.size()); + } + + // Number of generated tokens + int32_t num_generated() const { + return static_cast(output_tokens.size()); + } +}; + +// Information about a sequence scheduled for this iteration +struct ScheduledSequence { + int32_t seq_id; + int32_t kv_slot; + int32_t num_tokens; // Tokens to process this iteration + int32_t start_pos; // Starting position + int32_t context_len; // Context length after processing + bool is_prefill; // true = prefill, false = decode + std::vector tokens; // Actual token IDs to process +}; + +// Output of scheduler.schedule() call +struct SchedulerOutput { + // Sequences scheduled for this iteration + std::vector scheduled_seqs; + + // Sequences preempted this iteration (for logging/metrics) + std::vector preempted_seq_ids; + + // Total tokens in this batch + int32_t num_tokens = 0; + + // Breakdown + int32_t num_prefill_tokens = 0; + int32_t num_decode_tokens = 0; + int32_t num_prefill_seqs = 0; + int32_t num_decode_seqs = 0; + + // True if there's work to do + bool has_work() const { return !scheduled_seqs.empty(); } + + // True if batch contains mixed prefill + decode 
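// ---------------------------------------------------------------------------
// Editor's note (not part of the original patch): a worked trace of the
// Sequence helpers above for one request going through chunked prefill and
// then decode. The numbers follow the 20-token-prompt example used in
// scheduler.cpp; token ids are arbitrary.
//
//   Sequence seq;
//   seq.prompt_tokens.resize(20);        // prompt_len() == 20
//   // prefill_pos = 0 : is_prefill() = true,  remaining_prefill() = 20, context_len() = 0
//
//   seq.prefill_pos += 16;               // first prefill chunk (16 tokens)
//   // is_prefill() = true,  remaining_prefill() = 4,  context_len() = 16
//
//   seq.prefill_pos += 4;                // second chunk finishes prefill
//   // is_prefill() = false, remaining_prefill() = 0,  context_len() = 20
//
//   seq.output_tokens.push_back(42);     // first generated token recorded
//   // num_generated() = 1,  context_len() = 21
// ---------------------------------------------------------------------------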
+ bool is_mixed() const { return num_prefill_seqs > 0 && num_decode_seqs > 0; } +}; + +} // namespace base + +#endif // KUIPER_BASE_SEQUENCE_H diff --git a/kuiper/include/model/llama3.h b/kuiper/include/model/llama3.h index fdf8dd7..468bddd 100644 --- a/kuiper/include/model/llama3.h +++ b/kuiper/include/model/llama3.h @@ -43,8 +43,28 @@ class LLama2Model : public Model { base::Status forward(const tensor::Tensor& input, const tensor::Tensor& pos_tensor, int& next) const override; + // Unified forward interface - supports all scenarios + base::Status forward_unified( + const tensor::Tensor& hidden_states, // [num_tokens, dim] + const base::BatchMetadata& batch_meta, + std::vector& next_tokens) const override; + op::EmbeddingOutput embedding(const std::vector& tokens) const override; + // Get cuda config for external use + std::shared_ptr get_cuda_config() const { return cuda_config_; } + + // Initialize batch buffers for multi-sequence processing + void init_batch_buffers(int32_t num_seqs); + + // Batch forward for multi-sequence decode (legacy, calls forward_unified internally) + // inputs: [num_seqs, dim] - one token per sequence + // positions: [num_seqs] - position for each sequence + // outputs: [num_seqs] - next token for each sequence + base::Status forward_batch(const tensor::Tensor& inputs, + const tensor::Tensor& positions, + std::vector& next_tokens) const; + private: void init_mem() override; @@ -58,6 +78,9 @@ class LLama2Model : public Model { void attention_mha(int32_t layer_idx, const tensor::Tensor& pos_tensor) const; + // Paged attention MHA for decode phase + void attention_mha_paged(int32_t layer_idx, const tensor::Tensor& pos_tensor) const; + void attention_rms(int32_t layer_idx, const tensor::Tensor& input) const; void feed_forward(int32_t layer_idx, const tensor::Tensor& input) const; @@ -68,6 +91,9 @@ class LLama2Model : public Model { int32_t post_processing(const tensor::Tensor& pos, bool is_prompt) const override; + // Helper methods for unified forward + void ensure_unified_buffers(int32_t num_tokens) const; + private: std::shared_ptr cuda_config_; std::unique_ptr llama_layers_; diff --git a/kuiper/include/model/model.h b/kuiper/include/model/model.h index 63b8514..f2dfeb6 100644 --- a/kuiper/include/model/model.h +++ b/kuiper/include/model/model.h @@ -3,6 +3,9 @@ #include #include #include +#include +#include "base/base.h" +#include "base/batch_metadata.h" #include "config.h" #include "op/encode.h" #include "op/layer.h" @@ -10,7 +13,8 @@ #include "sampler/argmax_sampler.h" #include "sentencepiece_processor.h" #include "tensor/tensor.h" - +#include "base/kv_cache_manager.h" +#include "base/block_allocator.h" namespace model { class Model { public: @@ -19,12 +23,57 @@ class Model { virtual base::Status init(base::DeviceType device_type) = 0; + // Legacy single-token interface (deprecated, calls forward_unified internally) virtual base::Status predict(const tensor::Tensor& input, const tensor::Tensor& pos_tensor, bool is_prompt, int& next) const = 0; + // Legacy single-token forward (deprecated, calls forward_unified internally) virtual base::Status forward(const tensor::Tensor& input, const tensor::Tensor& pos_tensor, int& next) const = 0; + // ============================================================================ + // Unified forward interface - supports all scenarios: + // - Single token decode + // - Batch decode (multiple sequences, one token each) + // - Chunked prefill (single sequence, multiple tokens) + // - Mixed prefill + decode (continuous 
batching) + // ============================================================================ + virtual base::Status forward_unified( + const tensor::Tensor& hidden_states, // [num_tokens, dim] + const base::BatchMetadata& batch_meta, + std::vector& next_tokens) const; + + // Convenience methods + base::Status forward_decode( + const std::vector& seq_ids, + const std::vector& positions, + const tensor::Tensor& hidden_states, + std::vector& next_tokens) const; + + base::Status forward_prefill( + int32_t seq_id, + const tensor::Tensor& hidden_states, // [num_tokens, dim] + int32_t start_pos, + int32_t& next_token) const; + + // Chunked prefill: automatically splits long sequences into chunks + base::Status forward_chunked_prefill( + int32_t seq_id, + const tensor::Tensor& hidden_states, // [total_tokens, dim] + int32_t start_pos, + int32_t chunk_size, // 0 = auto-compute + int32_t& next_token) const; + + // Compute optimal chunk size based on available memory + int32_t compute_chunk_size(int32_t total_tokens, int32_t num_active_seqs = 1) const; + + virtual base::Status init_paged_kv_cache(int32_t num_seqs, int32_t block_size, + int32_t max_blocks, + int32_t max_blocks_per_seq = -1); + + // Initialize unified buffers for variable token counts + virtual void init_unified_buffers(int32_t max_tokens); + base::ModelType model_type() const; const std::string& token_path() const; @@ -54,6 +103,12 @@ class Model { const op::EmbeddingOutput& embedding_output, bool is_prompt) const; + // Get model configuration + const TransformerConfig* config() const { return config_.get(); } + + // Get KV cache manager (for paged attention) + base::KVCacheManager* get_kv_cache_manager() const { return kv_cache_manager_.get(); } + protected: virtual base::Status insert_buffer(ModelBufferType buffer_idx, const tensor::Tensor& tensor); @@ -80,6 +135,12 @@ class Model { protected: int32_t group_size_ = 1; + int32_t num_seqs_ = 1; // Number of sequences for batch processing + int32_t max_unified_tokens_ = 0; // Max tokens for unified buffers + mutable std::mutex paged_kv_mutex_; + std::unique_ptr block_allocator_; + std::unique_ptr kv_cache_manager_; + bool is_quant_model_ = false; std::unique_ptr config_; diff --git a/kuiper/include/model/request.h b/kuiper/include/model/request.h new file mode 100644 index 0000000..ec79fba --- /dev/null +++ b/kuiper/include/model/request.h @@ -0,0 +1,46 @@ +#ifndef KUIPER_INCLUDE_MODEL_REQUEST_H_ +#define KUIPER_INCLUDE_MODEL_REQUEST_H_ + +#include +#include + +namespace model { + +// Request structure for managing inference requests with paged attention +struct Request { + int32_t req_id; // Unique request identifier + std::vector tokens; // Token sequence (prompt + generated tokens) + std::vector block_table; // Physical block IDs allocated for this request + int32_t prompt_len; // Length of the original prompt + bool is_finished; // Whether generation is complete + + explicit Request(int32_t id, const std::vector& prompt_tokens) + : req_id(id), + tokens(prompt_tokens), + prompt_len(static_cast(prompt_tokens.size())), + is_finished(false) {} + + // Get current sequence length (prompt + generated tokens) + int32_t get_current_seq_len() const { + return static_cast(tokens.size()); + } + + // Get number of generated tokens (excluding prompt) + int32_t get_generated_len() const { + return get_current_seq_len() - prompt_len; + } + + // Add a newly generated token + void add_token(int32_t token) { + tokens.push_back(token); + } + + // Mark request as finished + void finish() { + is_finished 
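// ---------------------------------------------------------------------------
// Editor's note (not part of the original patch): minimal usage sketch of this
// struct; the token ids are arbitrary illustrations.
//
//   model::Request req(/*id=*/0, std::vector<int32_t>{1, 15043, 3186});  // 3-token prompt
//   // req.get_current_seq_len() == 3, req.get_generated_len() == 0
//
//   req.add_token(278);                  // one generated token
//   // req.get_current_seq_len() == 4, req.get_generated_len() == 1
//
//   req.block_table = {0, 1};            // filled in by the block allocator/manager
//   req.finish();                        // req.is_finished == true
// ---------------------------------------------------------------------------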
= true; + } +}; + +} // namespace model + +#endif // KUIPER_INCLUDE_MODEL_REQUEST_H_ diff --git a/kuiper/include/tensor/tensor.h b/kuiper/include/tensor/tensor.h index 038f13c..6f591ce 100644 --- a/kuiper/include/tensor/tensor.h +++ b/kuiper/include/tensor/tensor.h @@ -14,7 +14,7 @@ class Tensor { explicit Tensor() = default; explicit Tensor(base::DataType data_type, int32_t dim0, bool need_alloc = false, - std::shared_ptr alloc = nullptr, void* ptr = nullptr); + std::shared_ptr alloc = nullptr, void* ptr = nullptr); explicit Tensor(base::DataType data_type, int32_t dim0, int32_t dim1, bool need_alloc = false, std::shared_ptr alloc = nullptr, void* ptr = nullptr); @@ -30,6 +30,10 @@ class Tensor { explicit Tensor(base::DataType data_type, std::vector dims, bool need_alloc = false, std::shared_ptr alloc = nullptr, void* ptr = nullptr); + // Constructor for large tensors (size > INT32_MAX) + explicit Tensor(base::DataType data_type, size_t size, bool need_alloc = false, + std::shared_ptr alloc = nullptr, void* ptr = nullptr); + void to_cpu(); void to_cuda(cudaStream_t stream = nullptr); diff --git a/kuiper/source/base/batch_metadata.cpp b/kuiper/source/base/batch_metadata.cpp new file mode 100644 index 0000000..bc44c07 --- /dev/null +++ b/kuiper/source/base/batch_metadata.cpp @@ -0,0 +1,264 @@ +/** + + * @author jintang-coder + * - BatchMetadata: Token-to-sequence mapping for GPU + * + */ +// BatchMetadata 中 tokens 的拆解与映射 + +// 核心数据结构 + +// struct BatchMetadata { +// int32_t num_tokens; // 总token数 +// int32_t num_seqs; // 序列数 + +// // 关键映射数组 (长度 = num_tokens) +// tensor::Tensor seq_ids; // [num_tokens] 每个token属于哪个序列 +// tensor::Tensor positions; // [num_tokens] 每个token在序列中的位置 + +// // 序列边界 (长度 = num_seqs + 1) +// tensor::Tensor seq_start_locs; // [num_seqs+1] 每个序列在token数组中的起始位置 +// }; + +// --- +// 具体例子:Mixed Batch (Prefill + Decode 混合) + +// 假设调度器输出: +// Seq0 (kv_slot=0): prefill, 10 tokens, start_pos=0 +// Seq1 (kv_slot=1): decode, 1 token, start_pos=25 +// Seq2 (kv_slot=2): prefill, 5 tokens, start_pos=0 + +// 输入参数 (来自 Scheduler) + +// seq_ids_in = [0, 1, 2] // kv_slot +// token_counts = [10, 1, 5] // 每个序列的token数 +// start_positions = [0, 25, 0] // 每个序列的起始位置 +// context_lens = [10, 26, 5] // 处理后的context长度 +// is_prefill = [true, false, true] + +// create_mixed() 构建过程 + +// // 1. 计算总token数 +// num_tokens = 10 + 1 + 5 = 16 +// num_seqs = 3 + +// // 2. 构建 seq_start_locs (序列边界) +// seq_start_locs = [0, 10, 11, 16] +// // │ │ │ └── 结束位置 +// // │ │ └── seq2 起始 (offset=11) +// // │ └── seq1 起始 (offset=10) +// // └── seq0 起始 (offset=0) + +// // 3. 展开 seq_ids (每个token属于哪个序列) +// seq_ids = [0,0,0,0,0,0,0,0,0,0, 1, 2,2,2,2,2] +// // └──── seq0: 10个 ────┘ │ └─ seq2: 5个 ─┘ +// // seq1: 1个 + +// // 4. 
展开 positions (每个token的位置) +// positions = [0,1,2,3,4,5,6,7,8,9, 25, 0,1,2,3,4] +// // └── seq0: pos 0-9 ──┘ │ └─ seq2: pos 0-4 ─┘ +// // seq1: pos 25 + + + + + + + +#include "base/batch_metadata.h" +#include +#include +#include +#include + +namespace base { + +bool BatchMetadata::has_prefill() const { + for (bool is_pf : is_prefill_cpu) { + if (is_pf) return true; + } + return false; +} + +bool BatchMetadata::has_decode() const { + for (bool is_pf : is_prefill_cpu) { + if (!is_pf) return true; + } + return false; +} + +int32_t BatchMetadata::max_prefill_len() const { + int32_t max_len = 0; + for (int32_t i = 0; i < num_seqs; ++i) { + if (is_prefill_cpu[i]) { + int32_t len = seq_start_locs_cpu[i + 1] - seq_start_locs_cpu[i]; + max_len = std::max(max_len, len); + } + } + return max_len; +} + +int32_t BatchMetadata::max_context_len() const { + if (seq_lens_cpu.empty()) return 0; + return *std::max_element(seq_lens_cpu.begin(), seq_lens_cpu.end()); +} + +int32_t BatchMetadata::get_num_prefill_tokens() const { + int32_t count = 0; + for (int32_t i = 0; i < num_seqs; ++i) { + if (is_prefill_cpu[i]) { + count += seq_start_locs_cpu[i + 1] - seq_start_locs_cpu[i]; + } + } + return count; +} + +int32_t BatchMetadata::get_num_decode_tokens() const { + int32_t count = 0; + for (int32_t i = 0; i < num_seqs; ++i) { + if (!is_prefill_cpu[i]) { + count += seq_start_locs_cpu[i + 1] - seq_start_locs_cpu[i]; + } + } + return count; +} + +std::vector BatchMetadata::get_last_token_indices() const { + std::vector indices(num_seqs); + for (int32_t i = 0; i < num_seqs; ++i) { + indices[i] = seq_start_locs_cpu[i + 1] - 1; + } + return indices; +} + +void BatchMetadata::sync_to_device(std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream) { + // Allocate GPU tensors + seq_ids = tensor::Tensor(DataType::kDataTypeInt32, num_tokens, true, alloc_gpu); + positions = tensor::Tensor(DataType::kDataTypeInt32, num_tokens, true, alloc_gpu); + seq_start_locs = tensor::Tensor(DataType::kDataTypeInt32, num_seqs + 1, true, alloc_gpu); + + seq_ids.set_device_type(DeviceType::kDeviceCUDA); + positions.set_device_type(DeviceType::kDeviceCUDA); + seq_start_locs.set_device_type(DeviceType::kDeviceCUDA); + + // Copy data to GPU + cudaStream_t cuda_stream = static_cast(stream); + + cudaMemcpyAsync(seq_ids.ptr(), seq_ids_cpu.data(), + num_tokens * sizeof(int32_t), cudaMemcpyHostToDevice, cuda_stream); + + cudaMemcpyAsync(positions.ptr(), positions_cpu.data(), + num_tokens * sizeof(int32_t), cudaMemcpyHostToDevice, cuda_stream); + + cudaMemcpyAsync(seq_start_locs.ptr(), seq_start_locs_cpu.data(), + (num_seqs + 1) * sizeof(int32_t), cudaMemcpyHostToDevice, cuda_stream); +} + +BatchMetadata BatchMetadata::create_decode( + const std::vector& seq_ids_in, + const std::vector& positions_in, + const std::vector& context_lens, + std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream) { + CHECK_EQ(seq_ids_in.size(), positions_in.size()); + CHECK_EQ(seq_ids_in.size(), context_lens.size()); + + BatchMetadata meta; + meta.num_tokens = static_cast(seq_ids_in.size()); + meta.num_seqs = static_cast(seq_ids_in.size()); + + // For decode, each sequence has exactly 1 token + meta.seq_ids_cpu = seq_ids_in; + meta.positions_cpu = positions_in; + meta.seq_lens_cpu = context_lens; + + // Build seq_start_locs: [0, 1, 2, ..., num_seqs] + meta.seq_start_locs_cpu.resize(meta.num_seqs + 1); + for (int32_t i = 0; i <= meta.num_seqs; ++i) { + meta.seq_start_locs_cpu[i] = i; + } + + // All sequences are in decode phase + 
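// Editor's note (not part of the original patch): worked example of what this
// factory produces for a pure-decode batch of three sequences; the values are
// illustrative but follow the conventions documented at the top of this file.
//
//   create_decode(seq_ids_in   = {0, 1, 2},      // kv_slots
//                 positions_in = {10, 26, 5},    // next position per sequence
//                 context_lens = {11, 27, 6}, ...)
//   produces:
//     num_tokens = num_seqs = 3
//     seq_ids_cpu        = {0, 1, 2}
//     positions_cpu      = {10, 26, 5}
//     seq_lens_cpu       = {11, 27, 6}
//     seq_start_locs_cpu = {0, 1, 2, 3}          // exactly one token per sequence
//     is_prefill_cpu     = {false, false, false}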
meta.is_prefill_cpu.resize(meta.num_seqs, false); + + meta.sync_to_device(alloc_gpu, alloc_cpu, stream); + return meta; +} + +BatchMetadata BatchMetadata::create_prefill( + int32_t seq_id, + int32_t start_pos, + int32_t num_tokens, + int32_t context_len, + std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream) { + CHECK_GT(num_tokens, 0); + + BatchMetadata meta; + meta.num_tokens = num_tokens; + meta.num_seqs = 1; + + // All tokens belong to the same sequence + meta.seq_ids_cpu.resize(num_tokens, seq_id); + + // Positions: [start_pos, start_pos+1, ..., start_pos+num_tokens-1] + meta.positions_cpu.resize(num_tokens); + for (int32_t i = 0; i < num_tokens; ++i) { + meta.positions_cpu[i] = start_pos + i; + } + + meta.seq_start_locs_cpu = {0, num_tokens}; + meta.seq_lens_cpu = {context_len}; + meta.is_prefill_cpu = {true}; + + meta.sync_to_device(alloc_gpu, alloc_cpu, stream); + return meta; +} + +BatchMetadata BatchMetadata::create_mixed( + const std::vector& seq_ids_in, + const std::vector& token_counts, + const std::vector& start_positions, + const std::vector& context_lens, + const std::vector& is_prefill, + std::shared_ptr alloc_gpu, + std::shared_ptr alloc_cpu, + void* stream) { + int32_t num_seqs = static_cast(seq_ids_in.size()); + CHECK_EQ(token_counts.size(), num_seqs); + CHECK_EQ(start_positions.size(), num_seqs); + CHECK_EQ(context_lens.size(), num_seqs); + CHECK_EQ(is_prefill.size(), num_seqs); + + BatchMetadata meta; + meta.num_seqs = num_seqs; + meta.num_tokens = std::accumulate(token_counts.begin(), token_counts.end(), 0); + + meta.seq_ids_cpu.reserve(meta.num_tokens); + meta.positions_cpu.reserve(meta.num_tokens); + meta.seq_start_locs_cpu.resize(num_seqs + 1); + meta.seq_lens_cpu = context_lens; + meta.is_prefill_cpu = is_prefill; + + int32_t offset = 0; + for (int32_t i = 0; i < num_seqs; ++i) { + meta.seq_start_locs_cpu[i] = offset; + + for (int32_t t = 0; t < token_counts[i]; ++t) { + meta.seq_ids_cpu.push_back(seq_ids_in[i]); + meta.positions_cpu.push_back(start_positions[i] + t); + } + + offset += token_counts[i]; + } + meta.seq_start_locs_cpu[num_seqs] = offset; + + meta.sync_to_device(alloc_gpu, alloc_cpu, stream); + return meta; +} + +} // namespace base \ No newline at end of file diff --git a/kuiper/source/base/kv_cache_manager.cpp b/kuiper/source/base/kv_cache_manager.cpp new file mode 100644 index 0000000..1695c2f --- /dev/null +++ b/kuiper/source/base/kv_cache_manager.cpp @@ -0,0 +1,546 @@ +/** + * @author jintang-coder + * - KVCacheManager: KV Cache memory and sequence slot management + * + * ============================================================================= + * KVCacheManager KV缓存管理器 + * ============================================================================= + * + * 功能: 管理序列槽位(slot)分配 + 物理块(block)映射 + KV Cache读写 + * + * 核心数据结构: + * class KVCacheManager { + * tensor::Tensor kv_cache_dev_; // GPU上的KV缓存池 + * tensor::Tensor block_tables_host_; // [num_seqs, max_blocks_per_seq] 逻辑块→物理块映射 + * tensor::Tensor context_lens_host_; // [num_seqs] 每个序列的当前长度 + * std::vector free_seq_slots_; // 空闲序列槽位 + * BlockAllocator block_alloc_; // 物理块分配器 + * }; + * + * --- + * 具体例子: 3个序列的完整生命周期 + * + * 配置: + * num_seqs = 4, block_size = 16, max_blocks = 100 + * + * 初始状态: + * free_seq_slots_ = [3, 2, 1, 0] + * block_tables_ = [[-1,-1,...], [-1,-1,...], [-1,-1,...], [-1,-1,...]] + * context_lens_ = [0, 0, 0, 0] + * + * --- + * 步骤1: 分配序列槽位 + * + * int slot = kv_manager->allocate_sequence_slot(); + * // slot = 0 + * // free_seq_slots_ = [3, 2, 1] + * + * --- 
+ * 步骤2: 为Prefill分配blocks (假设prompt有50个tokens) + * + * kv_manager->allocate_blocks_for_tokens(slot=0, num_tokens=50); + * // 需要 ceil(50/16) = 4 个blocks + * // 调用 block_alloc_->allocate() 4次,获得 blocks [0,1,2,3] + * + * block_tables_[0] = [0, 1, 2, 3, -1, -1, ...] + * // │ │ │ │ + * // │ │ │ └─ logical_block_3 → physical_block_3 + * // │ │ └─ logical_block_2 → physical_block_2 + * // │ └─ logical_block_1 → physical_block_1 + * // └─ logical_block_0 → physical_block_0 + * + * context_lens_[0] = 50 + * + * --- + * 步骤3: Decode阶段 - 生成新token + * + * // token位置从50增长到64,仍在block_3内 (16*4=64) + * // 无需新块 + * + * // token位置增长到65,需要新块 + * kv_manager->allocate_blocks_for_tokens(slot=0, num_tokens=65); + * // logical_block_4 需要分配 → physical_block_4 + * block_tables_[0] = [0, 1, 2, 3, 4, -1, ...] + * + * --- + * 步骤4: 序列完成 - 释放资源 + * + * kv_manager->release_sequence_slot(slot=0); + * // 1. 收集 blocks [0,1,2,3,4] + * // 2. 调用 block_alloc_->free([0,1,2,3,4]) + * // 3. block_tables_[0] = [-1,-1,-1,...] + * // 4. context_lens_[0] = 0 + * // 5. free_seq_slots_.push_back(0) → [3,2,1,0] + * + * --- + * KV Cache 内存布局: + * + * kv_cache_dev_ shape: [max_blocks, num_layers, 2, block_size, kv_dim] + * + * 访问 seq0, layer2, token位置35 的 Key: + * logical_block = 35 / 16 = 2 + * token_in_block = 35 % 16 = 3 + * physical_block = block_tables_[0][2] = 2 + * + * offset = physical_block * (num_layers * 2 * block_size * kv_dim) + * + layer_idx * (2 * block_size * kv_dim) + * + 0 * (block_size * kv_dim) // K plane + * + token_in_block * kv_dim + * + * --- + * 与Scheduler的交互: + * + * Scheduler KVCacheManager + * │ │ + * │ admit_sequence() │ + * │ └─> allocate_sequence_slot() ───►│ 返回 slot + * │ └─> allocate_blocks_for_tokens()►│ 分配物理块 + * │ │ + * │ finish_sequence() │ + * │ └─> release_sequence_slot() ────►│ 释放slot+blocks + * │ │ + * + */ + +#include "base/kv_cache_manager.h" +#include +#include +#include "tensor/tensor.h" + +namespace base { + +base::Status KVCacheManager::init_paged_token_major( + base::DeviceType device_type, + std::shared_ptr alloc_dev, + std::shared_ptr alloc_cpu, + int32_t num_seqs, + int32_t num_layers, + int32_t num_kv_heads, + int32_t head_dim, + int32_t block_size, + int32_t max_blocks, + int32_t max_blocks_per_seq, + void* stream) { + + std::lock_guard lock(mu_); + + device_type_ = device_type; + num_seqs_ = num_seqs; + num_layers_ = num_layers; + num_kv_heads_ = num_kv_heads; + head_dim_ = head_dim; + kv_dim_ = num_kv_heads * head_dim; + block_size_ = block_size; + max_blocks_ = max_blocks; + max_blocks_per_seq_ = max_blocks_per_seq; + + alloc_dev_ = alloc_dev; + alloc_cpu_ = alloc_cpu; + + // Initialize block allocator + block_alloc_ = std::make_unique (max_blocks, block_size); + + // Allocate KV cache on device + // Layout: [max_blocks, num_layers, 2 (K+V), block_size, kv_dim] + size_t kv_cache_size = static_cast(max_blocks) * num_layers * 2 * block_size * kv_dim_; + kv_cache_dev_ = tensor::Tensor(base::DataType::kDataTypeFp32, kv_cache_size); + kv_cache_dev_.allocate(alloc_dev_); + kv_cache_dev_.set_device_type(device_type); + + // Initialize to zero + if (device_type == base::DeviceType::kDeviceCUDA) { + cudaMemsetAsync(kv_cache_dev_.ptr(), 0, kv_cache_size * sizeof(float), + static_cast(stream)); + } else { + memset(kv_cache_dev_.ptr(), 0, kv_cache_size * sizeof(float)); + } + + // Allocate block tables on host and device + // Layout: [num_seqs, max_blocks_per_seq] + int32_t block_table_size = num_seqs * max_blocks_per_seq; + block_tables_host_ = tensor::Tensor(base::DataType::kDataTypeInt32, 
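// Editor's note (not part of the original patch): a rough sizing of the
// kv_cache_size computed above, under an illustrative 7B-llama-like shape.
// These numbers are assumptions for the example, not taken from this repo:
//   num_layers = 32, num_kv_heads = 32, head_dim = 128 (kv_dim_ = 4096),
//   block_size = 16, max_blocks = 1024
//
//   floats = 1024 * 32 * 2 * 16 * 4096 = 4,294,967,296  -> about 16 GiB at 4 B/float
//   per token: 32 layers * 2 planes * 4096 floats * 4 B = 1 MiB of fp32 KV cache
//   capacity: 1024 blocks * 16 tokens/block = 16,384 cacheable token positions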
block_table_size); + block_tables_host_.allocate(alloc_cpu_); + block_tables_host_.set_device_type(base::DeviceType::kDeviceCPU); + + block_tables_dev_ = tensor::Tensor(base::DataType::kDataTypeInt32, block_table_size); + block_tables_dev_.allocate(alloc_dev_); + block_tables_dev_.set_device_type(device_type); + + // Initialize block tables to -1 (invalid) + int32_t* block_table_ptr = block_tables_host_.ptr(); + for (int32_t i = 0; i < block_table_size; ++i) { + block_table_ptr[i] = -1; + } + + // Allocate context lengths on host and device + // Layout: [num_seqs] + context_lens_host_ = tensor::Tensor(base::DataType::kDataTypeInt32, num_seqs); + context_lens_host_.allocate(alloc_cpu_); + context_lens_host_.set_device_type(base::DeviceType::kDeviceCPU); + + context_lens_dev_ = tensor::Tensor(base::DataType::kDataTypeInt32, num_seqs); + context_lens_dev_.allocate(alloc_dev_); + context_lens_dev_.set_device_type(device_type); + // Initialize context lengths to 0 + int32_t* context_lens_ptr = context_lens_host_.ptr(); + for (int32_t i = 0; i < num_seqs; ++i) { + context_lens_ptr[i] = 0; + } + + // Initialize free sequence slots (reverse order so we allocate 0, 1, 2, ...) + free_seq_slots_.clear(); + free_seq_slots_.reserve(num_seqs); + for (int32_t i = num_seqs - 1; i >= 0; --i) { + free_seq_slots_.push_back(i); + } + + tables_dirty_ = true; + + return base::error::Success(); +} + +std::pair KVCacheManager::slice_kv( + int32_t seq_idx, int32_t layer_idx, int32_t token_pos) { + + std::lock_guard lock(mu_); + + CHECK_LT(seq_idx, num_seqs_) << "Sequence index out of bounds"; + CHECK_LT(layer_idx, num_layers_) << "Layer index out of bounds"; + + // Determine which logical block this token belongs to + int32_t logical_block = token_pos / block_size_; + int32_t token_in_block = token_pos % block_size_; + + // Get or allocate physical block + int32_t physical_block_id = ensure_physical_block_(seq_idx, logical_block); + CHECK_GE(physical_block_id, 0) << "Failed to allocate physical block"; + + // Update context length if needed + int32_t* context_lens_ptr = context_lens_host_.ptr(); + if (token_pos >= context_lens_ptr[seq_idx]) { + context_lens_ptr[seq_idx] = token_pos + 1; + tables_dirty_ = true; + } + + // Calculate pointers to K and V + float* key_ptr = key_ptr_(physical_block_id, layer_idx, token_in_block); + float* value_ptr = value_ptr_(physical_block_id, layer_idx, token_in_block); + + // Create tensor views (non-owning) + tensor::Tensor key(base::DataType::kDataTypeFp32, kv_dim_, false, nullptr, key_ptr); + tensor::Tensor val(base::DataType::kDataTypeFp32, kv_dim_, false, nullptr, value_ptr); + + key.set_device_type(device_type_); + val.set_device_type(device_type_); + + return {key, val}; +} + +const tensor::Tensor& KVCacheManager::kv_cache_device() const { + return kv_cache_dev_; +} + +const tensor::Tensor& KVCacheManager::block_tables_device() { + std::lock_guard lock(mu_); + if (tables_dirty_) { + sync_tables_to_device_unlocked_(nullptr); + } + return block_tables_dev_; +} + +const tensor::Tensor& KVCacheManager::context_lens_device() { + std::lock_guard lock(mu_); + if (tables_dirty_) { + sync_tables_to_device_unlocked_(nullptr); + } + return context_lens_dev_; +} + +const tensor::Tensor& KVCacheManager::block_tables_host() const { + return block_tables_host_; +} + +const tensor::Tensor& KVCacheManager::context_lens_host() const { + return context_lens_host_; +} + +void KVCacheManager::sync_tables_to_device(void* stream) { + std::lock_guard lock(mu_); + 
sync_tables_to_device_unlocked_(stream); +} + +void KVCacheManager::sync_tables_to_device_unlocked_(void* stream) { + if (!tables_dirty_) { + return; + } + + + // Copy block tables + if (device_type_ == base::DeviceType::kDeviceCUDA) { + cudaMemcpyAsync(block_tables_dev_.ptr(), + block_tables_host_.ptr(), + block_tables_host_.size() * sizeof(int32_t), + cudaMemcpyHostToDevice, + static_cast(stream)); + + cudaMemcpyAsync(context_lens_dev_.ptr(), + context_lens_host_.ptr(), + context_lens_host_.size() * sizeof(int32_t), + cudaMemcpyHostToDevice, + static_cast(stream)); + } else { + memcpy(block_tables_dev_.ptr(), + block_tables_host_.ptr(), + block_tables_host_.size() * sizeof(int32_t)); + + memcpy(context_lens_dev_.ptr(), + context_lens_host_.ptr(), + context_lens_host_.size() * sizeof(int32_t)); + } + + tables_dirty_ = false; +} + +void KVCacheManager::reset_sequence(int32_t seq_idx) { + std::lock_guard lock(mu_); + + CHECK_LT(seq_idx, num_seqs_) << "Sequence index out of bounds"; + + // Free all blocks for this sequence + std::vector blocks_to_free; + int32_t* block_table_ptr = block_tables_host_.ptr(); + + for (int32_t i = 0; i < max_blocks_per_seq_; ++i) { + int32_t block_id = block_table_ptr[seq_idx * max_blocks_per_seq_ + i]; + if (block_id >= 0) { + blocks_to_free.push_back(block_id); + block_table_ptr[seq_idx * max_blocks_per_seq_ + i] = -1; + } + } + + if (!blocks_to_free.empty()) { + block_alloc_->free(blocks_to_free); + } + + // Reset context length + int32_t* context_lens_ptr = context_lens_host_.ptr(); + context_lens_ptr[seq_idx] = 0; + + tables_dirty_ = true; +} + +void KVCacheManager::reset_all() { + std::lock_guard lock(mu_); + + // Reset all sequences + int32_t* block_table_ptr = block_tables_host_.ptr(); + for (int32_t i = 0; i < num_seqs_ * max_blocks_per_seq_; ++i) { + block_table_ptr[i] = -1; + } + + int32_t* context_lens_ptr = context_lens_host_.ptr(); + for (int32_t i = 0; i < num_seqs_; ++i) { + context_lens_ptr[i] = 0; + } + + // Reinitialize block allocator + block_alloc_ = std::make_unique(max_blocks_, block_size_); + + tables_dirty_ = true; +} + +int32_t KVCacheManager::block_size() const { + return block_size_; +} + +int32_t KVCacheManager::max_blocks_per_seq() const { + return max_blocks_per_seq_; +} + +int32_t KVCacheManager::kv_dim() const { + return kv_dim_; +} + +int32_t KVCacheManager::num_layers() const { + return num_layers_; +} + +int32_t KVCacheManager::num_kv_heads() const { + return num_kv_heads_; +} + +int32_t KVCacheManager::head_dim() const { + return head_dim_; +} + +int32_t KVCacheManager::num_seqs() const { + return num_seqs_; +} + +int32_t KVCacheManager::max_blocks() const { + return max_blocks_; +} + +int32_t KVCacheManager::get_context_len(int32_t seq_idx) const { + std::lock_guard lock(mu_); + CHECK_LT(seq_idx, num_seqs_) << "Sequence index out of bounds"; + return context_lens_host_.ptr()[seq_idx]; +} + +void KVCacheManager::set_context_len(int32_t seq_idx, int32_t context_len) { + std::lock_guard lock(mu_); + CHECK_LT(seq_idx, num_seqs_) << "Sequence index out of bounds"; + CHECK_GE(context_len, 0) << "Context length must be non-negative"; + + int32_t* context_lens_ptr = context_lens_host_.ptr(); + context_lens_ptr[seq_idx] = context_len; + tables_dirty_ = true; +} + +bool KVCacheManager::allocate_blocks_for_tokens(int32_t seq_idx, int32_t num_tokens) { + std::lock_guard lock(mu_); + CHECK_LT(seq_idx, num_seqs_) << "Sequence index out of bounds"; + + if (num_tokens <= 0) { + return true; + } + + // Calculate how many blocks we need + 
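// Editor's note (not part of the original patch): the line below is integer
// ceil-division. With block_size_ = 16:
//   num_tokens = 50  ->  (50 + 15) / 16 = 4 blocks   (covers positions 0..63)
//   num_tokens = 64  ->  (64 + 15) / 16 = 4 blocks   (exactly full)
//   num_tokens = 65  ->  (65 + 15) / 16 = 5 blocks   (first token of a new block)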
int32_t num_blocks_needed = (num_tokens + block_size_ - 1) / block_size_; + + if (num_blocks_needed > max_blocks_per_seq_) { + LOG(ERROR) << "Requested " << num_blocks_needed << " blocks, but max_blocks_per_seq is " + << max_blocks_per_seq_; + return false; + } + + // Check current allocation and allocate missing blocks + int32_t* block_table_ptr = block_tables_host_.ptr(); + + for (int32_t logical_block = 0; logical_block < num_blocks_needed; ++logical_block) { + int32_t table_idx = seq_idx * max_blocks_per_seq_ + logical_block; + if (block_table_ptr[table_idx] < 0) { + // Need to allocate this block + int32_t physical_block_id = block_alloc_->allocate(); + if (physical_block_id < 0) { + LOG(ERROR) << "Out of memory: cannot allocate block " << logical_block; + return false; + } + block_table_ptr[table_idx] = physical_block_id; + tables_dirty_ = true; + } + } + + // Update context length + int32_t* context_lens_ptr = context_lens_host_.ptr(); + if (num_tokens > context_lens_ptr[seq_idx]) { + context_lens_ptr[seq_idx] = num_tokens; + tables_dirty_ = true; + } + + return true; +} + + +// Private helper methods + + float* KVCacheManager::key_ptr_(int32_t physical_block_id, int32_t layer_idx, + int32_t token_in_block) { + // Layout: [max_blocks, num_layers, 2 (K+V), block_size, kv_dim] + size_t offset = static_cast(physical_block_id) * num_layers_ * 2 * block_size_ * kv_dim_ + + static_cast(layer_idx) * 2 * block_size_ * kv_dim_ + + 0 * block_size_ * kv_dim_ // K plane (index 0) + + static_cast(token_in_block) * kv_dim_; + + return (kv_cache_dev_.ptr()) + offset; +} + + float* KVCacheManager::value_ptr_(int32_t physical_block_id, int32_t layer_idx, + int32_t token_in_block) { + // Layout: [max_blocks, num_layers, 2 (K+V), block_size, kv_dim] + size_t offset = static_cast(physical_block_id) * num_layers_ * 2 * block_size_ * kv_dim_ + + static_cast(layer_idx) * 2 * block_size_ * kv_dim_ + + 1 * block_size_ * kv_dim_ // V plane (index 1) + + static_cast(token_in_block) * kv_dim_; + + return (kv_cache_dev_.ptr()) + offset; +} + +int32_t KVCacheManager::ensure_physical_block_(int32_t seq_idx, int32_t logical_block) { + CHECK_LT(logical_block, max_blocks_per_seq_) << "Logical block index out of bounds"; + + int32_t* block_table_ptr = block_tables_host_.ptr(); + int32_t table_idx = seq_idx * max_blocks_per_seq_ + logical_block; + + int32_t physical_block_id = block_table_ptr[table_idx]; + + // If not allocated, allocate a new block + if (physical_block_id < 0) { + physical_block_id = block_alloc_->allocate(); + CHECK_GE(physical_block_id, 0) << "Out of memory: cannot allocate new block"; + + block_table_ptr[table_idx] = physical_block_id; + tables_dirty_ = true; + } + + return physical_block_id; +} + +int32_t KVCacheManager::allocate_sequence_slot() { + std::lock_guard lock(mu_); + if (free_seq_slots_.empty()) { + return -1; + } + int32_t slot = free_seq_slots_.back(); + free_seq_slots_.pop_back(); + return slot; +} + +void KVCacheManager::release_sequence_slot(int32_t seq_idx) { + std::lock_guard lock(mu_); + if (seq_idx < 0 || seq_idx >= num_seqs_) { + return; + } + + // Free all blocks for this sequence + std::vector blocks_to_free; + int32_t* block_table_ptr = block_tables_host_.ptr(); + + for (int32_t i = 0; i < max_blocks_per_seq_; ++i) { + int32_t block_id = block_table_ptr[seq_idx * max_blocks_per_seq_ + i]; + if (block_id >= 0) { + blocks_to_free.push_back(block_id); + block_table_ptr[seq_idx * max_blocks_per_seq_ + i] = -1; + } + } + + if (!blocks_to_free.empty()) { + 
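// Editor's note (not part of the original patch): worked example for the
// key_ptr_/value_ptr_ offset math above, using deliberately tiny illustrative
// dimensions (num_layers_ = 2, block_size_ = 4, kv_dim_ = 8):
//
//   per-block stride = num_layers_ * 2 * block_size_ * kv_dim_ = 2*2*4*8 = 128 floats
//
//   physical_block_id = 3, layer_idx = 1, token_in_block = 2:
//     K offset = 3*128 + 1*(2*4*8) + 0*(4*8) + 2*8 = 384 + 64 +  0 + 16 = 464
//     V offset = 3*128 + 1*(2*4*8) + 1*(4*8) + 2*8 = 384 + 64 + 32 + 16 = 496
//
//   i.e. the V entry of a given (block, layer, token) always sits
//   block_size_ * kv_dim_ floats after the corresponding K entry.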
block_alloc_->free(blocks_to_free); + } + + // Reset context length + int32_t* context_lens_ptr = context_lens_host_.ptr(); + context_lens_ptr[seq_idx] = 0; + + tables_dirty_ = true; + + // Return slot to free pool + free_seq_slots_.push_back(seq_idx); +} + +bool KVCacheManager::has_free_slot() const { + std::lock_guard lock(mu_); + return !free_seq_slots_.empty(); +} + +int32_t KVCacheManager::num_free_slots() const { + std::lock_guard lock(mu_); + return static_cast(free_seq_slots_.size()); +} + +int32_t KVCacheManager::num_free_blocks() const { + std::lock_guard lock(mu_); + return block_alloc_->get_free_block_num(); +} + +} // namespace base diff --git a/kuiper/source/base/scheduler.cpp b/kuiper/source/base/scheduler.cpp new file mode 100644 index 0000000..41b8e8d --- /dev/null +++ b/kuiper/source/base/scheduler.cpp @@ -0,0 +1,738 @@ +/** + * @author jintang-coder + * - Scheduler: Request scheduling and continuous batching + * + * ============================================================================= + * Scheduler 调度器 + * ============================================================================= + * + * 功能: 管理推理请求队列,决定每次迭代处理哪些序列的哪些tokens + * + * 核心数据结构: + * class Scheduler { + * std::deque waiting_queue_; // 等待队列 (新请求) + * std::deque running_queue_; // 运行队列 (已分配KV slot) + * std::vector finished_seqs_; // 已完成序列 + * std::unordered_map sequences_; // 序列详情 + * KVCacheManager* kv_manager_; // KV缓存管理器 + * }; + * + * --- + * 具体例子: 3个请求的调度过程 + * + * 请求到达: + * Seq0: "Hello world" (10 tokens), max_new_tokens=50 + * Seq1: "Once upon a time" (20 tokens), max_new_tokens=50 + * Seq2: "The quick brown fox" (15 tokens), max_new_tokens=50 + * + * 配置: + * max_num_batched_tokens = 32 + * max_prefill_tokens = 16 (chunked prefill) + * decode_priority = true + * + * --- + * 迭代1: schedule() - 首次调度 + * + * waiting_queue_ = [0, 1, 2] + * running_queue_ = [] + * + * 1. schedule_decode() - 无decode序列,跳过 + * 2. schedule_prefill() - 无running序列,跳过 + * 3. try_admit_waiting() - 从waiting移入running + * + * remaining_tokens = 32 + * + * admit_sequence(seq0): + * kv_slot = kv_manager_->allocate_sequence_slot() → 0 + * chunk_size = min(32, 16, 10) = 10 + * remaining_tokens = 32 - 10 = 22 + * + * admit_sequence(seq1): + * kv_slot = kv_manager_->allocate_sequence_slot() → 1 + * chunk_size = min(22, 16, 20) = 16 + * remaining_tokens = 22 - 16 = 6 + * + * admit_sequence(seq2): + * kv_slot = kv_manager_->allocate_sequence_slot() → 2 + * chunk_size = min(6, 16, 15) = 6 // 用完剩余token额度 + * remaining_tokens = 6 - 6 = 0 + * + * 输出 SchedulerOutput: + * scheduled_seqs = [ + * {seq_id=0, kv_slot=0, num_tokens=10, start_pos=0, is_prefill=true}, + * {seq_id=1, kv_slot=1, num_tokens=16, start_pos=0, is_prefill=true}, + * {seq_id=2, kv_slot=2, num_tokens=6, start_pos=0, is_prefill=true}, + * ] + * num_tokens = 32, num_prefill_tokens = 32 + * + * 状态变化: + * waiting_queue_ = [] // 全部admit + * running_queue_ = [0, 1, 2] + * seq0.prefill_pos = 0 → 10 (prefill完成) + * seq1.prefill_pos = 0 → 16 (部分prefill, 还剩4个) + * seq2.prefill_pos = 0 → 6 (部分prefill, 还剩9个) + * + * --- + * 迭代2: schedule() - 混合批次 (Prefill续 + Decode) + * + * running_queue_ = [0, 1, 2] + * remaining_tokens = 32 + * + * 1. schedule_decode() - seq0已完成prefill,加入decode + * {seq_id=0, num_tokens=1, start_pos=10, is_prefill=false} + * remaining_tokens = 32 - 1 = 31 + * + * 2. 
schedule_prefill() - seq1还有4个token, seq2还有9个token + * seq1: chunk_size = min(31, 16, 4) = 4 + * {seq_id=1, num_tokens=4, start_pos=16, is_prefill=true} + * remaining_tokens = 31 - 4 = 27 + * + * seq2: chunk_size = min(27, 16, 9) = 9 + * {seq_id=2, num_tokens=9, start_pos=6, is_prefill=true} + * remaining_tokens = 27 - 9 = 18 + * + * 3. try_admit_waiting() - waiting为空,跳过 + * + * 输出 SchedulerOutput: + * scheduled_seqs = [ + * {seq_id=0, kv_slot=0, num_tokens=1, is_prefill=false}, // decode + * {seq_id=1, kv_slot=1, num_tokens=4, is_prefill=true}, // prefill完成 + * {seq_id=2, kv_slot=2, num_tokens=9, is_prefill=true}, // prefill完成 + * ] + * num_tokens = 14 + * num_decode_tokens = 1, num_prefill_tokens = 13 + * + * 状态变化: + * seq0.prefill_pos = 10 (已完成), 进入decode + * seq1.prefill_pos = 16 → 20 (prefill完成) + * seq2.prefill_pos = 6 → 15 (prefill完成) + * + * --- + * 迭代N: 全部进入decode阶段 + * + * 所有序列prefill完成,每次迭代: + * scheduled_seqs = [ + * {seq_id=0, num_tokens=1, is_prefill=false}, + * {seq_id=1, num_tokens=1, is_prefill=false}, + * {seq_id=2, num_tokens=1, is_prefill=false}, + * ] + * num_tokens = 3, num_decode_tokens = 3 + * + * --- + * 序列完成: finish_sequence() + * + * // 检测到EOS或达到max_new_tokens + * scheduler.finish_sequence(seq_id=0); + * + * 1. seq0.status = FINISHED + * 2. running_queue_.erase(0) → [1, 2] + * 3. kv_manager_->release_sequence_slot(kv_slot=0) + * 4. finished_seqs_.push_back(0) + * + * --- + * 调度策略 (decode_priority=true): + * + * 优先级: Decode > Prefill续 > 新Prefill + * + * 原因: Decode只需1个token,延迟敏感 + * Prefill可以分chunk,吞吐优先 + * + * schedule() 执行顺序: + * 1. schedule_decode() - 先处理所有decode序列 + * 2. schedule_prefill() - 再处理running中的prefill + * 3. try_admit_waiting() - 最后admit新序列 + * + * --- + * 与其他组件的关系: + * + * 用户请求 + * │ + * ▼ + * Scheduler.add_request() + * │ + * ▼ + * Scheduler.schedule() ──────► SchedulerOutput + * │ │ + * │ admit_sequence() │ scheduled_seqs + * ▼ ▼ + * KVCacheManager BatchMetadata.create_mixed() + * │ │ + * │ allocate_sequence_slot() │ 构建GPU映射数组 + * │ allocate_blocks_for_tokens() │ + * ▼ ▼ + * BlockAllocator GPU Kernels + * + * + * 最终驱逐策略 + + 优先级 (从高到低驱逐): + ┌─────────────────────────────────────────────────────────────┐ + │ 1. Prefill 阶段序列 - 按 context_len 从小到大 │ + │ (还没产出,用户影响最小) │ + ├─────────────────────────────────────────────────────────────┤ + │ 2. Decode 阶段序列 - 按 num_generated 从小到大 │ + │ (已在生成,用户正在等待) │ + └─────────────────────────────────────────────────────────────┘ + + 示例 + + running_queue_ = [seq0, seq1, seq2, seq3] + + seq0: prefill, context_len=100 (prefill了100个token) + seq1: prefill, context_len=20 ← 第1个驱逐目标 (prefill最短) + seq2: decode, num_generated=50 (已生成50个) + seq3: decode, num_generated=5 ← 第2个驱逐目标 (decode最短) + + 驱逐顺序: + 1. seq1 (prefill, context=20) - 优先驱逐prefill中最短的 + 2. seq0 (prefill, context=100) - 其次prefill中第二短的 + 3. seq3 (decode, generated=5) - 再次decode中最短的 + 4. 
seq2 (decode, generated=50) - 最后decode中第二短的 + * + * + * + * + * + * + * + */ + +#include "base/scheduler.h" +#include "base/kv_cache_manager.h" +#include +#include + +namespace base { + +Scheduler::Scheduler(const SchedulerConfig& config) : config_(config) {} + +void Scheduler::set_kv_cache_manager(KVCacheManager* kv_manager) { + std::lock_guard lock(mutex_); + kv_manager_ = kv_manager; +} + +int32_t Scheduler::add_request(const std::vector& prompt_tokens, + int32_t max_new_tokens, + const std::string& original_text) { + std::lock_guard lock(mutex_); + + Sequence seq; + seq.seq_id = next_seq_id_++; + seq.prompt_tokens = prompt_tokens; + seq.max_new_tokens = max_new_tokens; + seq.original_text = original_text; + seq.status = SequenceStatus::WAITING; + seq.arrival_time = std::chrono::steady_clock::now(); + + int32_t seq_id = seq.seq_id; + sequences_[seq_id] = std::move(seq); + waiting_queue_.push_back(seq_id); + + return seq_id; +} + +SchedulerOutput Scheduler::schedule() { + std::lock_guard lock(mutex_); + + SchedulerOutput output; + int32_t remaining_tokens = config_.max_num_batched_tokens; + int32_t remaining_seqs = config_.max_num_seqs; + + // Step 0: Check memory pressure and preempt if necessary + // Estimate blocks needed for decode (1 token per running decode sequence) + int32_t decode_blocks_needed = 0; + for (int32_t seq_id : running_queue_) { + auto it = sequences_.find(seq_id); + if (it != sequences_.end() && !it->second.is_prefill()) { + // Each decode may need a new block if crossing block boundary + decode_blocks_needed += estimate_blocks_needed(it->second, 1) - + estimate_blocks_needed(it->second, 0); + } + } + + if (need_preemption(decode_blocks_needed)) { + int32_t preempted = preempt_sequences(decode_blocks_needed); + if (preempted > 0) { + for (int32_t i = 0; i < preempted; ++i) { + if (!preempted_queue_.empty()) { + output.preempted_seq_ids.push_back(preempted_queue_.back()); + } + } + } + } + + if (config_.decode_priority) { + // Step 1: Schedule decode sequences first (each needs only 1 token) + schedule_decode(output, remaining_tokens, remaining_seqs); + + // Step 2: Schedule running prefill sequences + schedule_prefill(output, remaining_tokens, remaining_seqs); + + // Step 3: Try to re-admit preempted sequences (priority over new) + try_admit_preempted(output, remaining_tokens, remaining_seqs); + + // Step 4: Try to admit new sequences from waiting queue + try_admit_waiting(output, remaining_tokens, remaining_seqs); + } else { + // FCFS: process in order + schedule_prefill(output, remaining_tokens, remaining_seqs); + schedule_decode(output, remaining_tokens, remaining_seqs); + try_admit_preempted(output, remaining_tokens, remaining_seqs); + try_admit_waiting(output, remaining_tokens, remaining_seqs); + } + + return output; +} + +void Scheduler::schedule_decode(SchedulerOutput& output, int32_t& remaining_tokens, + int32_t& remaining_seqs) { + for (int32_t seq_id : running_queue_) { + if (remaining_tokens < 1 || remaining_seqs < 1) break; + + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) continue; + + Sequence& seq = it->second; + if (seq.status != SequenceStatus::RUNNING) continue; + if (seq.is_prefill()) continue; // Still in prefill phase + if (seq.next_token < 0) continue; // No token to decode + + add_to_schedule(output, seq, 1, false); + remaining_tokens -= 1; + remaining_seqs -= 1; + } +} + +void Scheduler::schedule_prefill(SchedulerOutput& output, int32_t& remaining_tokens, + int32_t& remaining_seqs) { + for (int32_t seq_id : running_queue_) 
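// Editor's note (not part of the original patch): the chunk size taken in the
// loop below is min(remaining_tokens, max_prefill_tokens, remaining_prefill);
// the same rule is reused in try_admit_waiting/try_admit_preempted. With the
// numbers from the example at the top of this file (max_prefill_tokens = 16):
//   remaining_tokens = 22, remaining_prefill = 20  ->  chunk = 16   (config cap)
//   remaining_tokens =  6, remaining_prefill = 15  ->  chunk =  6   (budget cap)
//   remaining_tokens = 31, remaining_prefill =  4  ->  chunk =  4   (finishes prefill)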
{ + if (remaining_tokens < 1 || remaining_seqs < 1) break; + + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) continue; + + Sequence& seq = it->second; + if (seq.status != SequenceStatus::RUNNING) continue; + if (!seq.is_prefill()) continue; // Already in decode phase + + // Calculate chunk size + int32_t remaining_prefill = seq.remaining_prefill(); + int32_t chunk_size = std::min({ + remaining_tokens, + config_.max_prefill_tokens, + remaining_prefill + }); + + if (chunk_size > 0) { + add_to_schedule(output, seq, chunk_size, true); + remaining_tokens -= chunk_size; + remaining_seqs -= 1; + } + } +} + +void Scheduler::try_admit_waiting(SchedulerOutput& output, int32_t& remaining_tokens, + int32_t& remaining_seqs) { + while (!waiting_queue_.empty() && remaining_tokens > 0 && remaining_seqs > 0) { + int32_t seq_id = waiting_queue_.front(); + + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) { + waiting_queue_.pop_front(); + continue; + } + + Sequence& seq = it->second; + + // Check if we can admit this sequence + if (!can_admit_new_sequence()) break; + + // Check memory pressure before admitting + int32_t blocks_needed = estimate_blocks_needed(seq, seq.prompt_len()); + if (need_preemption(blocks_needed)) break; + + // Try to allocate KV slot + if (!admit_sequence(seq)) break; + + waiting_queue_.pop_front(); + running_queue_.push_back(seq_id); + + // Schedule first prefill chunk + int32_t chunk_size = std::min({ + remaining_tokens, + config_.max_prefill_tokens, + seq.prompt_len() + }); + + if (chunk_size > 0) { + add_to_schedule(output, seq, chunk_size, true); + remaining_tokens -= chunk_size; + remaining_seqs -= 1; + } + } +} + +void Scheduler::try_admit_preempted(SchedulerOutput& output, int32_t& remaining_tokens, + int32_t& remaining_seqs) { + // Re-admit preempted sequences with priority over new requests + // Preempted sequences need to restart from beginning (KV cache was released) + while (!preempted_queue_.empty() && remaining_tokens > 0 && remaining_seqs > 0) { + int32_t seq_id = preempted_queue_.front(); + + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) { + preempted_queue_.pop_front(); + continue; + } + + Sequence& seq = it->second; + + // Check if we can admit this sequence + if (!can_admit_new_sequence()) break; + + // Check memory pressure before admitting + int32_t blocks_needed = estimate_blocks_needed(seq, seq.prompt_len()); + if (need_preemption(blocks_needed)) break; + + // Reset sequence state for re-prefill + seq.prefill_pos = 0; + seq.output_tokens.clear(); + seq.next_token = -1; + + // Try to allocate KV slot + if (!admit_sequence(seq)) break; + + preempted_queue_.pop_front(); + running_queue_.push_back(seq_id); + + // Schedule first prefill chunk + int32_t chunk_size = std::min({ + remaining_tokens, + config_.max_prefill_tokens, + seq.prompt_len() + }); + + if (chunk_size > 0) { + add_to_schedule(output, seq, chunk_size, true); + remaining_tokens -= chunk_size; + remaining_seqs -= 1; + } + + LOG(INFO) << "Re-admitted preempted sequence " << seq_id + << " (preemptions=" << seq.num_preemptions << ")"; + } +} + +void Scheduler::add_to_schedule(SchedulerOutput& output, Sequence& seq, + int32_t num_tokens, bool is_prefill) { + ScheduledSequence sched_seq; + sched_seq.seq_id = seq.seq_id; + sched_seq.kv_slot = seq.kv_slot; + sched_seq.num_tokens = num_tokens; + sched_seq.is_prefill = is_prefill; + + if (is_prefill) { + // Prefill: extract tokens from prompt + sched_seq.start_pos = seq.prefill_pos; + sched_seq.context_len = 
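// Editor's note (not part of the original patch): concrete values produced by
// this function, reusing the seq2 numbers from the example at the top of the
// file (prompt_len = 15, second prefill chunk); t stands for whatever token
// was last sampled for the sequence.
//
//   prefill call with prefill_pos = 6, num_tokens = 9:
//     -> start_pos = 6, context_len = 15, tokens = prompt_tokens[6..15)
//
//   later decode call with context_len() = 15, next_token = t:
//     -> start_pos = 15, context_len = 16, tokens = {t}, num_tokens = 1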
seq.prefill_pos + num_tokens; + sched_seq.tokens.assign( + seq.prompt_tokens.begin() + seq.prefill_pos, + seq.prompt_tokens.begin() + seq.prefill_pos + num_tokens + ); + output.num_prefill_tokens += num_tokens; + output.num_prefill_seqs += 1; + } else { + // Decode: single token + sched_seq.start_pos = seq.context_len(); + sched_seq.context_len = seq.context_len() + 1; + sched_seq.tokens = {seq.next_token}; + output.num_decode_tokens += 1; + output.num_decode_seqs += 1; + } + + output.num_tokens += num_tokens; + output.scheduled_seqs.push_back(std::move(sched_seq)); +} + +void Scheduler::update_after_forward(const SchedulerOutput& output, + const std::vector& next_tokens) { + std::lock_guard lock(mutex_); + + for (size_t i = 0; i < output.scheduled_seqs.size(); ++i) { + const auto& sched_seq = output.scheduled_seqs[i]; + auto it = sequences_.find(sched_seq.seq_id); + if (it == sequences_.end()) continue; + + Sequence& seq = it->second; + + if (sched_seq.is_prefill) { + // Update prefill progress + seq.prefill_pos += sched_seq.num_tokens; + + // If prefill just completed, record first token time + if (!seq.is_prefill() && seq.first_token_time.time_since_epoch().count() == 0) { + seq.first_token_time = std::chrono::steady_clock::now(); + } + } else { + // Decode: add the token we just processed to output + seq.output_tokens.push_back(seq.next_token); + } + + // Update next token from model output + if (i < next_tokens.size()) { + seq.next_token = next_tokens[i]; + } + + // Update KV cache context length + if (kv_manager_ != nullptr && seq.kv_slot >= 0) { + kv_manager_->set_context_len(seq.kv_slot, seq.context_len()); + } + } +} + +void Scheduler::finish_sequence(int32_t seq_id) { + std::lock_guard lock(mutex_); + + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) return; + + Sequence& seq = it->second; + if (seq.status == SequenceStatus::FINISHED) return; + + seq.status = SequenceStatus::FINISHED; + + // Remove from running queue + auto run_it = std::find(running_queue_.begin(), running_queue_.end(), seq_id); + if (run_it != running_queue_.end()) { + running_queue_.erase(run_it); + } + + // Release KV slot via KVCacheManager + if (seq.kv_slot >= 0 && kv_manager_ != nullptr) { + kv_manager_->release_sequence_slot(seq.kv_slot); + seq.kv_slot = -1; + } + + finished_seqs_.push_back(seq_id); +} + +const Sequence* Scheduler::get_sequence(int32_t seq_id) const { + std::lock_guard lock(mutex_); + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) return nullptr; + return &it->second; +} + +Sequence* Scheduler::get_sequence_mut(int32_t seq_id) { + std::lock_guard lock(mutex_); + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) return nullptr; + return &it->second; +} + +bool Scheduler::all_finished() const { + std::lock_guard lock(mutex_); + return waiting_queue_.empty() && running_queue_.empty() && preempted_queue_.empty(); +} + +bool Scheduler::has_pending_work() const { + std::lock_guard lock(mutex_); + return !waiting_queue_.empty() || !running_queue_.empty() || !preempted_queue_.empty(); +} + +int32_t Scheduler::num_waiting() const { + std::lock_guard lock(mutex_); + return static_cast(waiting_queue_.size()); +} + +int32_t Scheduler::num_running() const { + std::lock_guard lock(mutex_); + return static_cast(running_queue_.size()); +} + +int32_t Scheduler::num_preempted() const { + std::lock_guard lock(mutex_); + return static_cast(preempted_queue_.size()); +} + +int32_t Scheduler::num_finished() const { + std::lock_guard lock(mutex_); + return 
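// Editor's note (not part of the original patch): token bookkeeping in
// update_after_forward() above, traced over three iterations of one sequence
// (token ids are arbitrary):
//
//   iter 1  prefill chunk finishes, model returns 42
//           -> prefill_pos = prompt_len, next_token = 42, output_tokens = {}
//   iter 2  decode step scheduled with tokens = {42}, model returns 7
//           -> output_tokens = {42}, next_token = 7
//   iter 3  decode step scheduled with tokens = {7}, model returns 99
//           -> output_tokens = {42, 7}, next_token = 99
//
//   i.e. a sampled token only moves into output_tokens one iteration later,
//   when the decode step that consumed it is reported back.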
static_cast(finished_seqs_.size()); +} + +std::vector Scheduler::get_finished_sequences() const { + std::lock_guard lock(mutex_); + std::vector result; + result.reserve(finished_seqs_.size()); + for (int32_t seq_id : finished_seqs_) { + auto it = sequences_.find(seq_id); + if (it != sequences_.end()) { + result.push_back(&it->second); + } + } + return result; +} + +std::vector Scheduler::get_all_sequences() const { + std::lock_guard lock(mutex_); + std::vector result; + result.reserve(sequences_.size()); + for (const auto& pair : sequences_) { + result.push_back(&pair.second); + } + // Sort by seq_id for consistent ordering + std::sort(result.begin(), result.end(), + [](const Sequence* a, const Sequence* b) { + return a->seq_id < b->seq_id; + }); + return result; +} + +bool Scheduler::can_admit_new_sequence() const { + // Check if KVCacheManager has free slots + if (kv_manager_ == nullptr) { + return false; + } + return kv_manager_->has_free_slot(); +} + +int32_t Scheduler::estimate_blocks_needed(const Sequence& seq, int32_t additional_tokens) const { + int32_t total_tokens = seq.context_len() + additional_tokens; + return (total_tokens + config_.block_size - 1) / config_.block_size; +} + +bool Scheduler::admit_sequence(Sequence& seq) { + if (kv_manager_ == nullptr) { + return false; + } + + // Allocate KV slot via KVCacheManager + int32_t kv_slot = kv_manager_->allocate_sequence_slot(); + if (kv_slot < 0) { + return false; + } + + seq.kv_slot = kv_slot; + seq.status = SequenceStatus::RUNNING; + + // Pre-allocate KV cache blocks for prompt + if (!kv_manager_->allocate_blocks_for_tokens(kv_slot, seq.prompt_len())) { + // Failed to allocate blocks, release slot and return + kv_manager_->release_sequence_slot(kv_slot); + seq.kv_slot = -1; + seq.status = SequenceStatus::WAITING; + return false; + } + + return true; +} + +bool Scheduler::need_preemption(int32_t blocks_needed) const { + if (kv_manager_ == nullptr) { + return false; + } + int32_t free_blocks = kv_manager_->num_free_blocks(); + // Need preemption if free blocks < needed + min_free_blocks buffer + return free_blocks < (blocks_needed + config_.min_free_blocks); +} + +int32_t Scheduler::select_victim_sequence() const { + // Strategy (priority order): + // 1. Prefill sequences: select by smallest context_len (least computation invested) + // 2. 
Decode sequences: select by smallest num_generated (least output produced) + // + // Rationale: + // - Prefill sequences haven't produced any output yet, user impact is minimal + // - Decode sequences are actively generating, user is waiting for output + + int32_t victim_id = -1; + int32_t min_metric = INT32_MAX; + bool found_prefill_victim = false; + + // First pass: look for prefill sequences (higher priority to evict) + for (int32_t seq_id : running_queue_) { + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) continue; + + const Sequence& seq = it->second; + if (seq.status != SequenceStatus::RUNNING) continue; + + if (seq.is_prefill()) { + // Prefill: use context_len as metric + int32_t ctx_len = seq.context_len(); + if (ctx_len < min_metric) { + victim_id = seq_id; + min_metric = ctx_len; + found_prefill_victim = true; + } + } + } + + // If found a prefill victim, return it + if (found_prefill_victim) { + return victim_id; + } + + // Second pass: look for decode sequences (lower priority to evict) + min_metric = INT32_MAX; + for (int32_t seq_id : running_queue_) { + auto it = sequences_.find(seq_id); + if (it == sequences_.end()) continue; + + const Sequence& seq = it->second; + if (seq.status != SequenceStatus::RUNNING) continue; + + if (!seq.is_prefill()) { + // Decode: use num_generated as metric + int32_t generated = seq.num_generated(); + if (generated < min_metric) { + victim_id = seq_id; + min_metric = generated; + } + } + } + + return victim_id; +} + +int32_t Scheduler::preempt_sequences(int32_t blocks_needed) { + int32_t num_preempted = 0; + + while (need_preemption(blocks_needed) && !running_queue_.empty()) { + int32_t victim_id = select_victim_sequence(); + if (victim_id < 0) break; + + auto it = sequences_.find(victim_id); + if (it == sequences_.end()) break; + + Sequence& victim = it->second; + + // Release KV resources + if (victim.kv_slot >= 0 && kv_manager_ != nullptr) { + kv_manager_->release_sequence_slot(victim.kv_slot); + victim.kv_slot = -1; + } + + // Update status + victim.status = SequenceStatus::PREEMPTED; + victim.num_preemptions++; + + // Remove from running queue + auto run_it = std::find(running_queue_.begin(), running_queue_.end(), victim_id); + if (run_it != running_queue_.end()) { + running_queue_.erase(run_it); + } + + // Add to preempted queue (will be rescheduled with priority) + preempted_queue_.push_back(victim_id); + + num_preempted++; + LOG(WARNING) << "Preempted sequence " << victim_id + << " (generated=" << victim.num_generated() + << ", preemptions=" << victim.num_preemptions << ")"; + } + + return num_preempted; +} + +} // namespace base \ No newline at end of file diff --git a/kuiper/source/model/llama3.cpp b/kuiper/source/model/llama3.cpp index 3b0d255..c64fbd1 100644 --- a/kuiper/source/model/llama3.cpp +++ b/kuiper/source/model/llama3.cpp @@ -1,6 +1,7 @@ #include "model/llama3.h" #include #include +#include #include #include #include @@ -8,6 +9,8 @@ #include #include "../op/kernels/cpu/rope_kernel.h" #include "../op/kernels/cuda/rope_kernel.cuh" +#include "../op/kernels/cuda/paged_attention_kernel.cuh" +#include "../op/kernels/kernels_interface.h" #include "base/tick.h" namespace model { @@ -153,12 +156,18 @@ base::Status LLama2Model::forward(const tensor::Tensor& input, const tensor::Ten return base::error::InternalError("Unsupported int8 quant in the cpu device"); } + const bool use_paged_attention = (kv_cache_manager_ != nullptr); + for (int32_t layer_idx = 0; layer_idx < config_->layer_num_; ++layer_idx) { 
attention_rms(layer_idx, input); // attention (wq wk wv @ input) attention_qkv(layer_idx, pos_tensor); // multi-head attention - attention_mha(layer_idx, pos_tensor); + if (use_paged_attention) { + attention_mha_paged(layer_idx, pos_tensor); + } else { + attention_mha(layer_idx, pos_tensor); + } // feed forward feed_forward(layer_idx, input); } @@ -675,6 +684,61 @@ void LLama2Model::attention_mha(int32_t layer_idx, const tensor::Tensor& pos_ten STATUS_CHECK(wo_layer->forward(mha_output, attn_output)); } +void LLama2Model::attention_mha_paged(int32_t layer_idx, const tensor::Tensor& pos_tensor) const { + CHECK(llama_layers_ != nullptr); + CHECK(kv_cache_manager_ != nullptr) << "KV cache manager is null for paged attention"; + + tensor::Tensor mha_output = get_buffer(ModelBufferType::kOutputMHA); + tensor::Tensor query = this->get_buffer(ModelBufferType::kQuery); + + // Get paged KV cache tensors + const tensor::Tensor& kv_cache = kv_cache_manager_->kv_cache_device(); + const tensor::Tensor& block_tables = kv_cache_manager_->block_tables_device(); + const tensor::Tensor& context_lens = kv_cache_manager_->context_lens_device(); + + // Get parameters from config and kv_cache_manager + const int32_t num_heads = config_->head_num_; + const int32_t num_kv_heads = config_->kv_head_num_; + const int32_t head_dim = config_->head_size_; + const int32_t num_layers = config_->layer_num_; + const int32_t block_size = kv_cache_manager_->block_size(); + const int32_t max_blocks_per_seq = kv_cache_manager_->max_blocks_per_seq(); + // Get num_seqs from KVCacheManager instead of hardcoding + const int32_t num_seqs = kv_cache_manager_->num_seqs(); + const int32_t max_context_len = kv_cache_manager_->get_context_len(0); + + // Scale factor for attention: 1/sqrt(head_dim) + const float scale = 1.0f / std::sqrt(static_cast(head_dim)); + + // Get CUDA stream + cudaStream_t stream = cuda_config_ ? 
cuda_config_->stream : nullptr; + + // Call paged attention kernel + kernel::paged_attention_kernel_cu( + mha_output, + query, + kv_cache, + block_tables, + context_lens, + max_context_len, + num_seqs, + num_heads, + num_kv_heads, + head_dim, + block_size, + max_blocks_per_seq, + scale, + layer_idx, + num_layers, + stream); + + // wo @ attention output + tensor::Tensor attn_output = get_buffer(ModelBufferType::kAttnOutput); + const auto& wo_layer = llama_layers_->wo_layers_.at(layer_idx); + CHECK_NE(wo_layer, nullptr) << "The weight output layer is null pointer."; + STATUS_CHECK(wo_layer->forward(mha_output, attn_output)); +} + void LLama2Model::feed_forward(int32_t layer_idx, const tensor::Tensor& input) const { CHECK(llama_layers_ != nullptr); // residual add @@ -744,4 +808,522 @@ int32_t LLama2Model::post_processing(const tensor::Tensor& pos, bool is_prompt) return next; } +void LLama2Model::init_batch_buffers(int32_t num_seqs) { + if (num_seqs <= 1) { + return; // No need for batch buffers with single sequence + } + + std::shared_ptr alloc; + if (device_type_ == base::DeviceType::kDeviceCPU) { + alloc = base::CPUDeviceAllocatorFactory::get_instance(); + } else { + alloc = base::CUDADeviceAllocatorFactory::get_instance(); + } + std::shared_ptr alloc_cpu = + base::CPUDeviceAllocatorFactory::get_instance(); + + // Batch input embeddings: [num_seqs, dim] + tensor::Tensor batch_input(base::DataType::kDataTypeFp32, num_seqs, config_->dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchInputEmbeddings, batch_input)); + + // Batch positions: [num_seqs] on CPU (for reading), and GPU copy + tensor::Tensor batch_positions_cpu(base::DataType::kDataTypeInt32, num_seqs, true, alloc_cpu); + tensor::Tensor batch_positions_gpu(base::DataType::kDataTypeInt32, num_seqs, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchPositions, batch_positions_gpu)); + + // Batch RMSNorm output: [num_seqs, dim] + tensor::Tensor batch_rms_output(base::DataType::kDataTypeFp32, num_seqs, config_->dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchRMSNormOutput, batch_rms_output)); + + // Batch Query: [num_seqs, dim] + tensor::Tensor batch_query(base::DataType::kDataTypeFp32, num_seqs, config_->dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchQuery, batch_query)); + + // Batch Key: [num_seqs, kv_dim] + tensor::Tensor batch_key(base::DataType::kDataTypeFp32, num_seqs, config_->kv_dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchKey, batch_key)); + + // Batch Value: [num_seqs, kv_dim] + tensor::Tensor batch_value(base::DataType::kDataTypeFp32, num_seqs, config_->kv_dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchValue, batch_value)); + + // Batch MHA output: [num_seqs, dim] + tensor::Tensor batch_mha_output(base::DataType::kDataTypeFp32, num_seqs, config_->dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchOutputMHA, batch_mha_output)); + + // Batch Attention output: [num_seqs, dim] + tensor::Tensor batch_attn_output(base::DataType::kDataTypeFp32, num_seqs, config_->dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchAttnOutput, batch_attn_output)); + + // Batch FFN RMSNorm: [num_seqs, dim] + tensor::Tensor batch_ffn_rms(base::DataType::kDataTypeFp32, num_seqs, config_->dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchFFNRMSNorm, batch_ffn_rms)); + + // Batch W1 output: [num_seqs, hidden_dim] + tensor::Tensor batch_w1_output(base::DataType::kDataTypeFp32, num_seqs, config_->hidden_dim_, true, 
alloc); + CHECK(insert_buffer(ModelBufferType::kBatchW1Output, batch_w1_output)); + + // Batch W3 output: [num_seqs, hidden_dim] + tensor::Tensor batch_w3_output(base::DataType::kDataTypeFp32, num_seqs, config_->hidden_dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchW3Output, batch_w3_output)); + + // Batch W2 output: [num_seqs, dim] + tensor::Tensor batch_w2_output(base::DataType::kDataTypeFp32, num_seqs, config_->dim_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchW2Output, batch_w2_output)); + + // Batch forward output: [num_seqs, vocab_size] - logits for each sequence + tensor::Tensor batch_forward_output(base::DataType::kDataTypeFp32, num_seqs, config_->vocab_size_, true, alloc); + CHECK(insert_buffer(ModelBufferType::kBatchForwardOutput, batch_forward_output)); + + LOG(INFO) << "Initialized batch buffers for " << num_seqs << " sequences"; +} + +base::Status LLama2Model::forward_batch(const tensor::Tensor& inputs, + const tensor::Tensor& positions, + std::vector& next_tokens) const { + // inputs: [num_seqs, dim] + // positions: [num_seqs] on GPU + if (inputs.is_empty()) { + return base::error::InvalidArgument("The input tensor is empty."); + } + + const int32_t num_seqs = kv_cache_manager_->num_seqs(); + if (num_seqs <= 1) { + return base::error::InvalidArgument("forward_batch requires num_seqs > 1"); + } + + // Get batch buffers + tensor::Tensor batch_rms_output = get_buffer(ModelBufferType::kBatchRMSNormOutput); + tensor::Tensor batch_query = get_buffer(ModelBufferType::kBatchQuery); + tensor::Tensor batch_key = get_buffer(ModelBufferType::kBatchKey); + tensor::Tensor batch_value = get_buffer(ModelBufferType::kBatchValue); + tensor::Tensor batch_mha_output = get_buffer(ModelBufferType::kBatchOutputMHA); + tensor::Tensor batch_attn_output = get_buffer(ModelBufferType::kBatchAttnOutput); + tensor::Tensor batch_ffn_rms = get_buffer(ModelBufferType::kBatchFFNRMSNorm); + tensor::Tensor batch_w1_output = get_buffer(ModelBufferType::kBatchW1Output); + tensor::Tensor batch_w3_output = get_buffer(ModelBufferType::kBatchW3Output); + tensor::Tensor batch_w2_output = get_buffer(ModelBufferType::kBatchW2Output); + + // Get paged KV cache tensors + const tensor::Tensor& kv_cache = kv_cache_manager_->kv_cache_device(); + const tensor::Tensor& block_tables = kv_cache_manager_->block_tables_device(); + const tensor::Tensor& context_lens = kv_cache_manager_->context_lens_device(); + + // Get parameters + const int32_t num_heads = config_->head_num_; + const int32_t num_kv_heads = config_->kv_head_num_; + const int32_t head_dim = config_->head_size_; + const int32_t num_layers = config_->layer_num_; + const int32_t block_size = kv_cache_manager_->block_size(); + const int32_t max_blocks_per_seq = kv_cache_manager_->max_blocks_per_seq(); + const int32_t dim = config_->dim_; + const int32_t kv_dim = config_->kv_dim_; + const float scale = 1.0f / std::sqrt(static_cast(head_dim)); + + cudaStream_t stream = cuda_config_ ? cuda_config_->stream : nullptr; + + // Copy input to working buffer + tensor::Tensor working_input = inputs; // [num_seqs, dim] + + for (int32_t layer_idx = 0; layer_idx < num_layers; ++layer_idx) { + // 1. Attention RMSNorm (batch) - use dim version for batch + auto rmsnorm_layer = std::dynamic_pointer_cast( + llama_layers_->rmsnorm_layers_.at(layer_idx)); + kernel::get_rmsnorm_dim_kernel(device_type_)(working_input, rmsnorm_layer->get_weight(0), + batch_rms_output, dim, stream); + + // 2. 
QKV projection (batch GEMM) + // Q = rms_output @ Wq^T + auto wq_layer = std::dynamic_pointer_cast( + llama_layers_->wq_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)(batch_rms_output, wq_layer->get_weight(0), + batch_query, cuda_config_.get()); + + // K = rms_output @ Wk^T + auto wk_layer = std::dynamic_pointer_cast( + llama_layers_->wk_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)(batch_rms_output, wk_layer->get_weight(0), + batch_key, cuda_config_.get()); + + // V = rms_output @ Wv^T + auto wv_layer = std::dynamic_pointer_cast( + llama_layers_->wv_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)(batch_rms_output, wv_layer->get_weight(0), + batch_value, cuda_config_.get()); + + // 3. RoPE (batch) + kernel::get_rope_batch_kernel(device_type_)( + num_seqs, dim, kv_dim, head_dim, + batch_query, batch_key, positions, + get_buffer(ModelBufferType::kSinCache), + get_buffer(ModelBufferType::kCosCache), + stream); + + // 4. Write K, V to paged KV cache (batch) + kernel::batch_kv_cache_write_cu( + const_cast(kv_cache), + batch_key, + batch_value, + block_tables, + positions, + num_seqs, + num_kv_heads, + head_dim, + block_size, + max_blocks_per_seq, + layer_idx, + num_layers, + stream); + + // 5. Paged Attention (already supports batch) + // Get max context len across all sequences + int32_t max_context_len = 0; + for (int32_t s = 0; s < num_seqs; ++s) { + int32_t ctx_len = kv_cache_manager_->get_context_len(s); + if (ctx_len > max_context_len) max_context_len = ctx_len; + } + + kernel::paged_attention_kernel_cu( + batch_mha_output, + batch_query, + kv_cache, + block_tables, + context_lens, + max_context_len, + num_seqs, + num_heads, + num_kv_heads, + head_dim, + block_size, + max_blocks_per_seq, + scale, + layer_idx, + num_layers, + stream); + + // 6. Output projection: O = mha_output @ Wo^T + auto wo_layer = std::dynamic_pointer_cast( + llama_layers_->wo_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)(batch_mha_output, wo_layer->get_weight(0), + batch_attn_output, cuda_config_.get()); + + // 7. Residual add: working_input += attn_output + kernel::get_add_kernel(device_type_)(working_input, batch_attn_output, working_input, stream); + + // 8. FFN RMSNorm + auto ffn_rmsnorm = std::dynamic_pointer_cast( + llama_layers_->rmsnorm_layers_.at(layer_idx + num_layers)); + kernel::get_rmsnorm_dim_kernel(device_type_)(working_input, ffn_rmsnorm->get_weight(0), + batch_ffn_rms, dim, stream); + + // 9. FFN: W1 and W3 + auto w1_layer = std::dynamic_pointer_cast( + llama_layers_->w1_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)(batch_ffn_rms, w1_layer->get_weight(0), + batch_w1_output, cuda_config_.get()); + + auto w3_layer = std::dynamic_pointer_cast( + llama_layers_->w3_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)(batch_ffn_rms, w3_layer->get_weight(0), + batch_w3_output, cuda_config_.get()); + + // 10. SwiGLU + kernel::get_swiglu_kernel(device_type_)(batch_w1_output, batch_w3_output, batch_w1_output, stream); + + // 11. W2 + auto w2_layer = std::dynamic_pointer_cast( + llama_layers_->w2_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)(batch_w1_output, w2_layer->get_weight(0), + batch_w2_output, cuda_config_.get()); + + // 12. 
Residual add + kernel::get_add_kernel(device_type_)(working_input, batch_w2_output, working_input, stream); + } + + // Final RMSNorm + auto final_rmsnorm = std::dynamic_pointer_cast( + llama_layers_->rmsnorm_layers_.at(2 * num_layers)); + kernel::get_rmsnorm_dim_kernel(device_type_)(working_input, final_rmsnorm->get_weight(0), + batch_rms_output, dim, stream); + + // Classification layer: logits = rms_output @ cls_weight^T + // Output: [num_seqs, vocab_size] + tensor::Tensor batch_forward_output = get_buffer(ModelBufferType::kBatchForwardOutput); + auto cls_layer = std::dynamic_pointer_cast(llama_layers_->cls_layer_); + kernel::get_gemm_kernel(device_type_)(batch_rms_output, cls_layer->get_weight(0), + batch_forward_output, cuda_config_.get()); + + cudaStreamSynchronize(stream); + + // Sample next token for each sequence + next_tokens.resize(num_seqs); + const int32_t vocab_size = config_->vocab_size_; + const float* logits_ptr = batch_forward_output.ptr(); + + for (int32_t s = 0; s < num_seqs; ++s) { + const float* seq_logits = logits_ptr + s * vocab_size; + next_tokens[s] = static_cast(sampler_->sample(seq_logits, vocab_size, stream)); + } + + return base::error::Success(); +} + +// ============================================================================ +// Unified forward implementation +// Supports: single decode, batch decode, chunked prefill, mixed prefill+decode +// ============================================================================ + +void LLama2Model::ensure_unified_buffers(int32_t num_tokens) const { + // Cast away const for buffer initialization (buffers are mutable cache) + const_cast(this)->init_unified_buffers(num_tokens); +} + +base::Status LLama2Model::forward_unified( + const tensor::Tensor& hidden_states, + const base::BatchMetadata& batch_meta, + std::vector& next_tokens) const { + + if (hidden_states.is_empty()) { + return base::error::InvalidArgument("The hidden_states tensor is empty."); + } + + CHECK(kv_cache_manager_ != nullptr) << "KV cache manager not initialized for unified forward"; + + const int32_t num_tokens = batch_meta.num_tokens; + const int32_t num_seqs = batch_meta.num_seqs; + const bool has_prefill = batch_meta.has_prefill(); + const bool has_decode = batch_meta.has_decode(); + + // Ensure we have buffers for this many tokens + ensure_unified_buffers(num_tokens); + + cudaStream_t stream = cuda_config_ ? 
cuda_config_->stream : nullptr; + + // Get unified buffers + tensor::Tensor unified_rms_output = get_buffer(ModelBufferType::kUnifiedRMSNormOutput); + tensor::Tensor unified_query = get_buffer(ModelBufferType::kUnifiedQuery); + tensor::Tensor unified_key = get_buffer(ModelBufferType::kUnifiedKey); + tensor::Tensor unified_value = get_buffer(ModelBufferType::kUnifiedValue); + tensor::Tensor unified_mha_output = get_buffer(ModelBufferType::kUnifiedMHAOutput); + tensor::Tensor unified_attn_output = get_buffer(ModelBufferType::kUnifiedAttnOutput); + tensor::Tensor unified_ffn_rms = get_buffer(ModelBufferType::kUnifiedFFNRMSNorm); + tensor::Tensor unified_w1_output = get_buffer(ModelBufferType::kUnifiedW1Output); + tensor::Tensor unified_w3_output = get_buffer(ModelBufferType::kUnifiedW3Output); + tensor::Tensor unified_w2_output = get_buffer(ModelBufferType::kUnifiedW2Output); + + // Reshape buffers for current num_tokens + unified_rms_output.reshape({num_tokens, config_->dim_}); + unified_query.reshape({num_tokens, config_->dim_}); + unified_key.reshape({num_tokens, config_->kv_dim_}); + unified_value.reshape({num_tokens, config_->kv_dim_}); + unified_mha_output.reshape({num_tokens, config_->dim_}); + unified_attn_output.reshape({num_tokens, config_->dim_}); + unified_ffn_rms.reshape({num_tokens, config_->dim_}); + unified_w1_output.reshape({num_tokens, config_->hidden_dim_}); + unified_w3_output.reshape({num_tokens, config_->hidden_dim_}); + unified_w2_output.reshape({num_tokens, config_->dim_}); + + // Get paged KV cache tensors + const tensor::Tensor& kv_cache = kv_cache_manager_->kv_cache_device(); + const tensor::Tensor& block_tables = kv_cache_manager_->block_tables_device(); + const tensor::Tensor& context_lens = kv_cache_manager_->context_lens_device(); + + // Get parameters + const int32_t num_heads = config_->head_num_; + const int32_t num_kv_heads = config_->kv_head_num_; + const int32_t head_dim = config_->head_size_; + const int32_t num_layers = config_->layer_num_; + const int32_t block_size = kv_cache_manager_->block_size(); + const int32_t max_blocks_per_seq = kv_cache_manager_->max_blocks_per_seq(); + const int32_t dim = config_->dim_; + const int32_t kv_dim = config_->kv_dim_; + const float scale = 1.0f / std::sqrt(static_cast(head_dim)); + + // Working tensor - copy of hidden_states that we modify in-place + tensor::Tensor working_input = hidden_states; // [num_tokens, dim] + + for (int32_t layer_idx = 0; layer_idx < num_layers; ++layer_idx) { + // 1. Attention RMSNorm: [num_tokens, dim] -> [num_tokens, dim] + auto rmsnorm_layer = std::dynamic_pointer_cast( + llama_layers_->rmsnorm_layers_.at(layer_idx)); + kernel::get_rmsnorm_dim_kernel(device_type_)( + working_input, rmsnorm_layer->get_weight(0), unified_rms_output, dim, stream); + + // 2. QKV Projection using GEMM + auto wq_layer = std::dynamic_pointer_cast( + llama_layers_->wq_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)( + unified_rms_output, wq_layer->get_weight(0), unified_query, cuda_config_.get()); + + auto wk_layer = std::dynamic_pointer_cast( + llama_layers_->wk_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)( + unified_rms_output, wk_layer->get_weight(0), unified_key, cuda_config_.get()); + + auto wv_layer = std::dynamic_pointer_cast( + llama_layers_->wv_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)( + unified_rms_output, wv_layer->get_weight(0), unified_value, cuda_config_.get()); + + // 3. 
RoPE: Apply rotary position embeddings + kernel::get_rope_batch_kernel(device_type_)( + num_tokens, dim, kv_dim, head_dim, + unified_query, unified_key, batch_meta.positions, + get_buffer(ModelBufferType::kSinCache), + get_buffer(ModelBufferType::kCosCache), + stream); + + // 4. Write K, V to paged KV cache using v2 kernel with seq_ids + kernel::batch_kv_cache_write_v2_cu( + const_cast(kv_cache), + unified_key, + unified_value, + block_tables, + batch_meta.positions, + batch_meta.seq_ids, + num_tokens, + num_kv_heads, + head_dim, + block_size, + max_blocks_per_seq, + layer_idx, + num_layers, + stream); + + // 5. Attention: Choose kernel based on scenario + int32_t max_context_len = batch_meta.max_context_len(); + + if (has_prefill) { + // Use prefill and decode mixed kernel + kernel::paged_attention_prefill_cu( + unified_mha_output, + unified_query, + kv_cache, + block_tables, + context_lens, + batch_meta.seq_ids, + batch_meta.positions, + num_tokens, + num_seqs, + num_heads, + num_kv_heads, + head_dim, + block_size, + max_blocks_per_seq, + scale, + layer_idx, + num_layers, + stream); + } else { + // Pure decode: use standard paged attention kernel + kernel::paged_attention_kernel_cu( + unified_mha_output, + unified_query, + kv_cache, + block_tables, + context_lens, + max_context_len, + num_seqs, + num_heads, + num_kv_heads, + head_dim, + block_size, + max_blocks_per_seq, + scale, + layer_idx, + num_layers, + stream); + } + + // 6. Output projection: O = mha_output @ Wo^T + auto wo_layer = std::dynamic_pointer_cast( + llama_layers_->wo_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)( + unified_mha_output, wo_layer->get_weight(0), unified_attn_output, cuda_config_.get()); + + // 7. Residual add: working_input += attn_output + kernel::get_add_kernel(device_type_)(working_input, unified_attn_output, working_input, stream); + + // 8. FFN RMSNorm + auto ffn_rmsnorm = std::dynamic_pointer_cast( + llama_layers_->rmsnorm_layers_.at(layer_idx + num_layers)); + kernel::get_rmsnorm_dim_kernel(device_type_)( + working_input, ffn_rmsnorm->get_weight(0), unified_ffn_rms, dim, stream); + + // 9. FFN: W1 and W3 + auto w1_layer = std::dynamic_pointer_cast( + llama_layers_->w1_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)( + unified_ffn_rms, w1_layer->get_weight(0), unified_w1_output, cuda_config_.get()); + + auto w3_layer = std::dynamic_pointer_cast( + llama_layers_->w3_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)( + unified_ffn_rms, w3_layer->get_weight(0), unified_w3_output, cuda_config_.get()); + + // 10. SwiGLU + kernel::get_swiglu_kernel(device_type_)(unified_w1_output, unified_w3_output, unified_w1_output, stream); + + // 11. W2 + auto w2_layer = std::dynamic_pointer_cast( + llama_layers_->w2_layers_.at(layer_idx)); + kernel::get_gemm_kernel(device_type_)( + unified_w1_output, w2_layer->get_weight(0), unified_w2_output, cuda_config_.get()); + + // 12. 
Residual add + kernel::get_add_kernel(device_type_)(working_input, unified_w2_output, working_input, stream); + } + + // Final RMSNorm + auto final_rmsnorm = std::dynamic_pointer_cast( + llama_layers_->rmsnorm_layers_.at(2 * num_layers)); + kernel::get_rmsnorm_dim_kernel(device_type_)( + working_input, final_rmsnorm->get_weight(0), unified_rms_output, dim, stream); + + // Get last token indices for each sequence (for logits computation) + std::vector last_token_indices = batch_meta.get_last_token_indices(); + + // Compute logits only for the last token of each sequence + // For efficiency, we extract last tokens and do a smaller GEMM + next_tokens.resize(num_seqs); + auto cls_layer = std::dynamic_pointer_cast(llama_layers_->cls_layer_); + const int32_t vocab_size = config_->vocab_size_; + + // Allocate temporary buffer for last token hidden states and logits + std::shared_ptr alloc; + if (device_type_ == base::DeviceType::kDeviceCUDA) { + alloc = base::CUDADeviceAllocatorFactory::get_instance(); + } else { + alloc = base::CPUDeviceAllocatorFactory::get_instance(); + } + + tensor::Tensor last_hidden(base::DataType::kDataTypeFp32, num_seqs, dim, true, alloc); + tensor::Tensor logits(base::DataType::kDataTypeFp32, num_seqs, vocab_size, true, alloc); + + // Copy last token hidden states + for (int32_t s = 0; s < num_seqs; ++s) { + int32_t last_idx = last_token_indices[s]; + cudaMemcpyAsync( + last_hidden.ptr(s * dim), + unified_rms_output.ptr(last_idx * dim), + dim * sizeof(float), + cudaMemcpyDeviceToDevice, + stream); + } + + // Compute logits: [num_seqs, dim] @ [vocab_size, dim]^T -> [num_seqs, vocab_size] + kernel::get_gemm_kernel(device_type_)(last_hidden, cls_layer->get_weight(0), logits, cuda_config_.get()); + + cudaStreamSynchronize(stream); + + // Sample next token for each sequence + const float* logits_ptr = logits.ptr(); + for (int32_t s = 0; s < num_seqs; ++s) { + const float* seq_logits = logits_ptr + s * vocab_size; + next_tokens[s] = static_cast(sampler_->sample(seq_logits, vocab_size, stream)); + } + + return base::error::Success(); +} + } // namespace model \ No newline at end of file diff --git a/kuiper/source/model/model.cpp b/kuiper/source/model/model.cpp index b20b709..34bcd6c 100644 --- a/kuiper/source/model/model.cpp +++ b/kuiper/source/model/model.cpp @@ -1,4 +1,5 @@ #include "model/model.h" +#include #include #include #include @@ -18,9 +19,9 @@ const std::string& Model::token_path() const { return token_path_; } const std::string& Model::model_path() const { return model_path_; } base::Status Model::insert_buffer(ModelBufferType buffer_idx, const tensor::Tensor& tensor) { - if (buffers_.count(buffer_idx) > 0) { - return base::error::KeyHasExits(std::to_string(int(buffer_idx)) + " has exits in the buffers"); - } + // if (buffers_.count(buffer_idx) > 0) { + // return base::error::KeyHasExits(std::to_string(int(buffer_idx)) + " has exits in the buffers"); + // } if (tensor.is_empty()) { return base::error::InvalidArgument("The tensor is empty for inserting buffer."); } @@ -212,8 +213,43 @@ std::string Model::decode(std::vector token_idxs) const { return this->encode_layer_->decode(token_idxs); } +base::Status Model::init_paged_kv_cache(int32_t num_seqs, int32_t block_size, + int32_t max_blocks, int32_t max_blocks_per_seq) { + CHECK(config_ != nullptr); + CHECK_GT(num_seqs, 0); + CHECK_GT(block_size, 0); + CHECK_GT(max_blocks, 0); + + // Store num_seqs for batch processing + num_seqs_ = num_seqs; + + if (max_blocks_per_seq <= 0) { + max_blocks_per_seq = (config_->seq_len_ 
+ block_size - 1) / block_size; + } + + std::shared_ptr alloc_dev; + if (device_type_ == base::DeviceType::kDeviceCPU) { + alloc_dev = base::CPUDeviceAllocatorFactory::get_instance(); + } else { + alloc_dev = base::CUDADeviceAllocatorFactory::get_instance(); + } + std::shared_ptr alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + if (!kv_cache_manager_) { + kv_cache_manager_ = std::make_unique(); + } + + return kv_cache_manager_->init_paged_token_major( + device_type_, alloc_dev, alloc_cpu, num_seqs, config_->layer_num_, config_->kv_head_num_, + config_->head_size_, block_size, max_blocks, max_blocks_per_seq, nullptr); +} + std::pair Model::slice_kv_cache(int32_t layer_idx, int32_t token_pos) const { + if (kv_cache_manager_) { + return kv_cache_manager_->slice_kv(0, layer_idx, token_pos); // 这里默认只有一个seq + } + int32_t layer_offset = layer_idx * config_->seq_len_ * config_->kv_dim_; int32_t cache_offset = layer_offset + token_pos * config_->kv_dim_; @@ -258,4 +294,244 @@ tensor::Tensor Model::fill_input(const tensor::Tensor& pos_tensor, return input; } +// ============================================================================ +// Unified forward interface implementation +// ============================================================================ + +base::Status Model::forward_unified( + const tensor::Tensor& hidden_states, + const base::BatchMetadata& batch_meta, + std::vector& next_tokens) const { + // Default implementation - subclasses should override + return base::error::FunctionNotImplement("forward_unified not implemented in base Model class"); +} + +void Model::init_unified_buffers(int32_t max_tokens) { + if (max_tokens <= max_unified_tokens_) { + return; // Already have enough capacity + } + + std::shared_ptr alloc; + if (device_type_ == base::DeviceType::kDeviceCPU) { + alloc = base::CPUDeviceAllocatorFactory::get_instance(); + } else { + alloc = base::CUDADeviceAllocatorFactory::get_instance(); + } + + const int32_t dim = config_->dim_; + const int32_t kv_dim = config_->kv_dim_; + const int32_t hidden_dim = config_->hidden_dim_; + + // Allocate unified buffers for [max_tokens, dim/kv_dim/hidden_dim] + tensor::Tensor unified_query(base::DataType::kDataTypeFp32, max_tokens, dim, true, alloc); + tensor::Tensor unified_key(base::DataType::kDataTypeFp32, max_tokens, kv_dim, true, alloc); + tensor::Tensor unified_value(base::DataType::kDataTypeFp32, max_tokens, kv_dim, true, alloc); + tensor::Tensor unified_rms_output(base::DataType::kDataTypeFp32, max_tokens, dim, true, alloc); + tensor::Tensor unified_mha_output(base::DataType::kDataTypeFp32, max_tokens, dim, true, alloc); + tensor::Tensor unified_attn_output(base::DataType::kDataTypeFp32, max_tokens, dim, true, alloc); + tensor::Tensor unified_ffn_rms(base::DataType::kDataTypeFp32, max_tokens, dim, true, alloc); + tensor::Tensor unified_w1_output(base::DataType::kDataTypeFp32, max_tokens, hidden_dim, true, alloc); + tensor::Tensor unified_w3_output(base::DataType::kDataTypeFp32, max_tokens, hidden_dim, true, alloc); + tensor::Tensor unified_w2_output(base::DataType::kDataTypeFp32, max_tokens, dim, true, alloc); + + buffers_[ModelBufferType::kUnifiedQuery] = unified_query; + buffers_[ModelBufferType::kUnifiedKey] = unified_key; + buffers_[ModelBufferType::kUnifiedValue] = unified_value; + buffers_[ModelBufferType::kUnifiedRMSNormOutput] = unified_rms_output; + buffers_[ModelBufferType::kUnifiedMHAOutput] = unified_mha_output; + buffers_[ModelBufferType::kUnifiedAttnOutput] = unified_attn_output; + 
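  // Direct map assignment (buffers_[...] = t) overwrites any previous entry, which lets a
  // later, larger batch re-register these buffers after re-allocation; the capacity check at
  // the top of this function makes smaller requests a no-op. A minimal sketch of the
  // grow-only behaviour, with hypothetical token counts:
  //
  //   init_unified_buffers(512);   // allocates [512, dim], [512, kv_dim], [512, hidden_dim] buffers
  //   init_unified_buffers(128);   // no-op: 128 <= max_unified_tokens_ (512)
  //   init_unified_buffers(1024);  // re-allocates at [1024, ...] and replaces the map entries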
buffers_[ModelBufferType::kUnifiedFFNRMSNorm] = unified_ffn_rms; + buffers_[ModelBufferType::kUnifiedW1Output] = unified_w1_output; + buffers_[ModelBufferType::kUnifiedW3Output] = unified_w3_output; + buffers_[ModelBufferType::kUnifiedW2Output] = unified_w2_output; + + max_unified_tokens_ = max_tokens; + + LOG(INFO) << "Initialized unified buffers for max " << max_tokens << " tokens"; +} + +base::Status Model::forward_decode( + const std::vector& seq_ids, + const std::vector& positions, + const tensor::Tensor& hidden_states, + std::vector& next_tokens) const { + CHECK(kv_cache_manager_ != nullptr) << "KV cache manager not initialized"; + + std::shared_ptr alloc_gpu; + std::shared_ptr alloc_cpu; + if (device_type_ == base::DeviceType::kDeviceCUDA) { + alloc_gpu = base::CUDADeviceAllocatorFactory::get_instance(); + } else { + alloc_gpu = base::CPUDeviceAllocatorFactory::get_instance(); + } + alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + // Get context lengths from KV cache manager + std::vector context_lens(seq_ids.size()); + for (size_t i = 0; i < seq_ids.size(); ++i) { + context_lens[i] = kv_cache_manager_->get_context_len(seq_ids[i]); + } + + auto batch_meta = base::BatchMetadata::create_decode( + seq_ids, positions, context_lens, alloc_gpu, alloc_cpu, nullptr); + + return forward_unified(hidden_states, batch_meta, next_tokens); +} + +base::Status Model::forward_prefill( + int32_t seq_id, + const tensor::Tensor& hidden_states, + int32_t start_pos, + int32_t& next_token) const { + CHECK(kv_cache_manager_ != nullptr) << "KV cache manager not initialized"; + + std::shared_ptr alloc_gpu; + std::shared_ptr alloc_cpu; + if (device_type_ == base::DeviceType::kDeviceCUDA) { + alloc_gpu = base::CUDADeviceAllocatorFactory::get_instance(); + } else { + alloc_gpu = base::CPUDeviceAllocatorFactory::get_instance(); + } + alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + // Calculate num_tokens from hidden_states shape + int32_t num_tokens = static_cast(hidden_states.size() / config_->dim_); + int32_t context_len = start_pos + num_tokens; + + auto batch_meta = base::BatchMetadata::create_prefill( + seq_id, start_pos, num_tokens, context_len, alloc_gpu, alloc_cpu, nullptr); + + std::vector next_tokens; + auto status = forward_unified(hidden_states, batch_meta, next_tokens); + if (!status) { + return status; + } + + if (!next_tokens.empty()) { + next_token = next_tokens[0]; + } + return base::error::Success(); +} + +int32_t Model::compute_chunk_size(int32_t total_tokens, int32_t num_active_seqs) const { + // Default chunk size limits + constexpr int32_t kMinChunkSize = 16; + constexpr int32_t kMaxChunkSize = 2048; + constexpr int32_t kDefaultChunkSize = 512; + + if (total_tokens <= kMinChunkSize) { + return total_tokens; + } + + // Estimate memory per token (rough approximation) + // Each token needs: Q, K, V, intermediate buffers + // Q: dim, K: kv_dim, V: kv_dim, MHA output: dim, FFN intermediates: 2*hidden_dim + size_t per_token_floats = config_->dim_ * 4 + // Q, attn_out, rms_out, w2_out + config_->kv_dim_ * 2 + // K, V + config_->hidden_dim_ * 2; // W1, W3 outputs + size_t per_token_bytes = per_token_floats * sizeof(float); + + // Try to get available GPU memory (simplified - in practice would use cudaMemGetInfo) + // For now, use a conservative estimate based on typical GPU memory + size_t available_memory = 2ULL * 1024 * 1024 * 1024; // Assume 2GB available for activations + + // Account for other sequences in the batch + size_t memory_for_prefill = 
available_memory / (num_active_seqs > 0 ? num_active_seqs : 1); + + int32_t max_tokens_by_memory = static_cast(memory_for_prefill / per_token_bytes); + + // Clamp to reasonable range + int32_t chunk_size = std::min({ + total_tokens, + max_tokens_by_memory, + kMaxChunkSize + }); + + chunk_size = std::max(chunk_size, kMinChunkSize); + + // Round down to multiple of 16 for better GPU efficiency + chunk_size = (chunk_size / 16) * 16; + if (chunk_size == 0) chunk_size = kMinChunkSize; + + return chunk_size; +} + +base::Status Model::forward_chunked_prefill( + int32_t seq_id, + const tensor::Tensor& hidden_states, + int32_t start_pos, + int32_t chunk_size, + int32_t& next_token) const { + CHECK(kv_cache_manager_ != nullptr) << "KV cache manager not initialized"; + + int32_t total_tokens = static_cast(hidden_states.size() / config_->dim_); + + if (total_tokens == 0) { + return base::error::InvalidArgument("Empty hidden_states for chunked prefill"); + } + + // Auto-compute chunk size if not specified + if (chunk_size <= 0) { + chunk_size = compute_chunk_size(total_tokens, 1); + } + + std::shared_ptr alloc_gpu; + std::shared_ptr alloc_cpu; + if (device_type_ == base::DeviceType::kDeviceCUDA) { + alloc_gpu = base::CUDADeviceAllocatorFactory::get_instance(); + } else { + alloc_gpu = base::CPUDeviceAllocatorFactory::get_instance(); + } + alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + int32_t current_pos = start_pos; + int32_t tokens_processed = 0; + + while (tokens_processed < total_tokens) { + int32_t tokens_this_chunk = std::min(chunk_size, total_tokens - tokens_processed); + + // Create a view into hidden_states for this chunk + // hidden_states: [total_tokens, dim] + // chunk: [tokens_this_chunk, dim] + int32_t offset = tokens_processed * config_->dim_; + tensor::Tensor chunk_hidden(base::DataType::kDataTypeFp32, + tokens_this_chunk, config_->dim_, + false, nullptr, + const_cast(hidden_states.ptr(offset))); + chunk_hidden.set_device_type(device_type_); + + // Allocate blocks for this chunk + int32_t context_len_after_chunk = current_pos + tokens_this_chunk; + if (!kv_cache_manager_->allocate_blocks_for_tokens(seq_id, context_len_after_chunk)) { + return base::error::InternalError("Failed to allocate KV cache blocks for prefill chunk"); + } + + // Create batch metadata for this chunk + auto batch_meta = base::BatchMetadata::create_prefill( + seq_id, current_pos, tokens_this_chunk, context_len_after_chunk, + alloc_gpu, alloc_cpu, nullptr); + + // Forward this chunk + std::vector chunk_next_tokens; + auto status = forward_unified(chunk_hidden, batch_meta, chunk_next_tokens); + if (!status) { + return status; + } + + // Update position + current_pos += tokens_this_chunk; + tokens_processed += tokens_this_chunk; + + // Keep the last token prediction + if (!chunk_next_tokens.empty()) { + next_token = chunk_next_tokens[0]; + } + } + + // Update context length in KV cache manager + kv_cache_manager_->set_context_len(seq_id, current_pos); + + return base::error::Success(); +} + } // namespace model \ No newline at end of file diff --git a/kuiper/source/model/model_paged.cpp b/kuiper/source/model/model_paged.cpp new file mode 100644 index 0000000..b20b709 --- /dev/null +++ b/kuiper/source/model/model_paged.cpp @@ -0,0 +1,261 @@ +#include "model/model.h" +#include +#include +#include +namespace model { +Model::Model(base::TokenizerType tokenizer_type, base::ModelType model_type, std::string token_path, + std::string model_path, bool is_quant_model) + : tokenizer_type_(tokenizer_type), + 
model_type_(model_type), + token_path_(std::move(token_path)), + model_path_(std::move(model_path)), + is_quant_model_(is_quant_model) {} + +base::ModelType Model::model_type() const { return model_type_; } + +const std::string& Model::token_path() const { return token_path_; } + +const std::string& Model::model_path() const { return model_path_; } + +base::Status Model::insert_buffer(ModelBufferType buffer_idx, const tensor::Tensor& tensor) { + if (buffers_.count(buffer_idx) > 0) { + return base::error::KeyHasExits(std::to_string(int(buffer_idx)) + " has exits in the buffers"); + } + if (tensor.is_empty()) { + return base::error::InvalidArgument("The tensor is empty for inserting buffer."); + } + buffers_.insert({buffer_idx, tensor}); + return base::error::Success(); +} + +tensor::Tensor& Model::get_buffer(ModelBufferType buffer_idx) { + CHECK_GT(buffers_.count(buffer_idx), 0) << int(buffer_idx); + return buffers_.at(buffer_idx); +} + +const tensor::Tensor& Model::get_buffer(ModelBufferType buffer_idx) const { + CHECK_GT(buffers_.count(buffer_idx), 0); + return buffers_.at(buffer_idx); +} + +base::Status Model::read_model_file() { + using namespace base; + if (model_path_.empty()) { + return error::PathNotValid("Failed to open the weight file, the model path is empty!"); + } + int32_t fd = open(model_path_.data(), O_RDONLY); + if (fd == -1) { + return error::PathNotValid("Failed to open the weight file " + model_path_ + + " may be the path does not exist!"); + } + + FILE* file = fopen(model_path_.data(), "rb"); + if (!file) { + return error::PathNotValid("Failed to open the file. The path may be invalid."); + } + + auto config = ModelConfig{}; + if (fread(&config, sizeof(ModelConfig), 1, file) != 1) { + return error::ModelParseError( + "Failed to retrieve the configuration information from the model " + "file."); + } + if (is_quant_model_) { + if (fread(&group_size_, sizeof(int32_t), 1, file) != 1) { + return error::ModelParseError( + "Failed to retrieve the group size information from the model " + "file."); + } + } + + auto gen_status = generate_model_infos(config); + if (!gen_status) { + return gen_status; + } + + if (!is_quant_model_) { + raw_model_data_ = std::make_shared(); + } else { + raw_model_data_ = std::make_shared(); + } + + struct stat sb; + if (fstat(fd, &sb) == -1) { + close(fd); + return error::ModelParseError( + "Failed to retrieve the file size information from the model " + "file."); + } + raw_model_data_->file_size = sb.st_size; + + raw_model_data_->fd = fd; + raw_model_data_->data = + mmap(nullptr, raw_model_data_->file_size, PROT_READ, MAP_PRIVATE, raw_model_data_->fd, 0); + + if (raw_model_data_->data == MAP_FAILED || raw_model_data_->data == nullptr) { + return error::ModelParseError("Failed to map the weight file " + model_path_ + " into memory."); + } + if (!is_quant_model_) { + raw_model_data_->weight_data = + static_cast(raw_model_data_->data) + sizeof(ModelConfig); + } else { + raw_model_data_->weight_data = + static_cast(raw_model_data_->data) + sizeof(ModelConfig) + sizeof(group_size_); + } + if (raw_model_data_ == nullptr) { + LOG(ERROR); + return error::ModelParseError("Failed to map the weight file " + model_path_ + + " into memory, the pointer to weight start address is null"); + } + return error::Success(); +} + +base::Status Model::generate_model_infos(const ModelConfig& config) const { + config_->dim_ = config.dim; + config_->hidden_dim_ = config.hidden_dim; + config_->layer_num_ = config.layer_num; + config_->head_num_ = config.head_num; + 
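  // The derived fields a few lines below follow directly from these raw values. A worked
  // example with hypothetical grouped-query numbers (not read from any real model file):
  //   dim = 4096, head_num = 32, kv_head_num = 8
  //   head_size_ = dim / head_num               = 4096 / 32     = 128
  //   kv_dim_    = dim * kv_head_num / head_num = 4096 * 8 / 32 = 1024
  //   kv_mul_    = head_num / kv_head_num       = 32 / 8        = 4   (4 query heads per KV head)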
config_->kv_head_num_ = config.kv_head_num; + config_->seq_len_ = config.seq_len; + + config_->kv_dim_ = (config.dim * config.kv_head_num) / config.head_num; + config_->kv_mul_ = config.head_num / config.kv_head_num; + config_->head_size_ = config.dim / config.head_num; +#if defined(QWEN3_SUPPORT) + config_->immediate_dim_ = config.immediate_dim_; +#endif + if (config.vocab_size > 0) { + config_->is_shared_weight_ = true; + } else { + config_->is_shared_weight_ = false; + } + + // Qwen tokenizer size and embedding size is mismatched + // refer: https://github.com/QwenLM/Qwen2.5/issues/29 + // if (std::abs(config.vocab_size) != config_->vocab_size_) { + // return base::error::ModelParseError( + // "Vocabulary size mismatch between the model file and the token list."); + // } + config_->vocab_size_ = std::abs(config.vocab_size); + return base::error::Success(); +} + +base::Status Model::create_encode_layer() { + using namespace base; + + // create token encode decode layer + if (tokenizer_type_ == TokenizerType::kEncodeSpe) { + encode_layer_ = std::make_unique(this->token_path_, true, false); + } else { +#ifdef LLAMA3_SUPPORT + encode_layer_ = std::make_unique(this->token_path_, true, false); +#endif + +#if defined(QWEN2_SUPPORT) || defined(QWEN3_SUPPORT) + encode_layer_ = std::make_unique(this->token_path_, false, false); +#endif + } + if (!encode_layer_) { + return error::InternalError("Create the encode layer failed."); + } + + config_->vocab_size_ = encode_layer_->vocab_size(); + if (config_->vocab_size_ <= 0) { + return error::InternalError("The vocab size param read error from the model file!"); + } + return error::Success(); +} + +base::Status Model::gen_model_from_file() { + using namespace base; + config_ = std::make_unique(); + + // init sentence piece processor + // google sentence piece + auto create_encode_status = create_encode_layer(); + if (!create_encode_status) { + LOG(ERROR) << "Create the encode layer failed!"; + return create_encode_status; + } + // mmap + auto mmap_status = read_model_file(); + if (!mmap_status) { + LOG(ERROR) << "Handle model file " << model_path_ << " failed!"; + return mmap_status; + } + auto layer_create_status = create_layers(); + if (!layer_create_status) { + LOG(ERROR) << "Create layers for the model file " << model_path_ << " failed!"; + return layer_create_status; + } + + return error::Success(); +} + +std::vector Model::encode(const std::string& sentence) const { + CHECK(encode_layer_ != nullptr); + return encode_layer_->encode(sentence); +} + +bool Model::is_sentence_ending(int32_t token_idx) const { + CHECK(this->encode_layer_ != nullptr); + return this->encode_layer_->is_sentence_ending(token_idx); +} + +std::string Model::decode(int32_t token_idx) const { + CHECK(this->encode_layer_ != nullptr); + return this->encode_layer_->decode(token_idx); +} + +std::string Model::decode(std::vector token_idxs) const { + CHECK(this->encode_layer_ != nullptr); + return this->encode_layer_->decode(token_idxs); +} + +std::pair Model::slice_kv_cache(int32_t layer_idx, + int32_t token_pos) const { + int32_t layer_offset = layer_idx * config_->seq_len_ * config_->kv_dim_; + int32_t cache_offset = layer_offset + token_pos * config_->kv_dim_; + + float* key_cache_ptr = + const_cast(get_buffer(ModelBufferType::kKeyCache).ptr(cache_offset)); + float* val_cache_ptr = + const_cast(get_buffer(ModelBufferType::kValueCache).ptr(cache_offset)); + + tensor::Tensor key(base::DataType::kDataTypeFp32, config_->kv_dim_, false, nullptr, + key_cache_ptr); + tensor::Tensor 
val(base::DataType::kDataTypeFp32, config_->kv_dim_, false, nullptr, + val_cache_ptr); + key.set_device_type(device_type_); + val.set_device_type(device_type_); + return {key, val}; +} + +tensor::Tensor Model::fill_input(const tensor::Tensor& pos_tensor, + const op::EmbeddingOutput& embedding_output, + bool is_prompt) const { + const int32_t pos = pos_tensor.index(0); + auto [input_tokens, input_embeddings, input_token_num] = embedding_output; + + int32_t index = 0; + if (is_prompt) { + index = pos; + } +#if defined(QWEN3_SUPPORT) + std::shared_ptr input_emb_buffer = std::make_shared( + config_->hidden_dim_ * sizeof(float), nullptr, + input_embeddings.ptr(index * config_->hidden_dim_), true); + tensor::Tensor input(base::DataType::kDataTypeFp32, config_->hidden_dim_); + +#else + std::shared_ptr input_emb_buffer = + std::make_shared(config_->dim_ * sizeof(float), nullptr, + input_embeddings.ptr(index * config_->dim_), true); + tensor::Tensor input(base::DataType::kDataTypeFp32, config_->dim_); +#endif + input.assign(input_emb_buffer); + input.set_device_type(device_type_); + return input; +} + +} // namespace model \ No newline at end of file diff --git a/kuiper/source/op/kernels/cuda/gemm_kernel.cu b/kuiper/source/op/kernels/cuda/gemm_kernel.cu new file mode 100644 index 0000000..b78f11b --- /dev/null +++ b/kuiper/source/op/kernels/cuda/gemm_kernel.cu @@ -0,0 +1,322 @@ +#include +#include +#include "gemm_kernel.cuh" +#include "matmul_kernel.cuh" + +namespace kernel { + +// ============================================================================ +// Tiled GEMM Kernel Configuration +// ============================================================================ +// Block tile size: each thread block computes BLOCK_M x BLOCK_N output elements +// Thread tile size: each thread computes THREAD_M x THREAD_N output elements +// K dimension is processed in chunks of BLOCK_K + +constexpr int BLOCK_M = 64; // Block tile size in M dimension +constexpr int BLOCK_N = 64; // Block tile size in N dimension +constexpr int BLOCK_K = 32; // Block tile size in K dimension +constexpr int THREAD_M = 4; // Thread tile size in M dimension +constexpr int THREAD_N = 4; // Thread tile size in N dimension + +// Derived constants +constexpr int THREADS_M = BLOCK_M / THREAD_M; // 16 threads in M dimension +constexpr int THREADS_N = BLOCK_N / THREAD_N; // 16 threads in N dimension +constexpr int THREADS_PER_BLOCK = THREADS_M * THREADS_N; // 256 threads total + +// ============================================================================ +// Tiled GEMM Kernel +// ============================================================================ +// Computes C[M,N] = A[M,K] x B[N,K]^T where B is stored as [N,K] +// +// Memory Layout: +// A: row-major [M, K] +// B: row-major [N, K] (logically transposed, so B[n,k] accesses weight[n*K+k]) +// C: row-major [M, N] +// +// Optimization techniques: +// 1. Shared memory tiling to reduce global memory access +// 2. Thread-level tiling for register reuse +// 3. Vectorized loads (float4) where possible +// 4. 
Loop unrolling for better instruction-level parallelism + +template +__global__ void gemm_tiled_kernel( + const float* __restrict__ A, // [M, K] + const float* __restrict__ B, // [N, K] - weight matrix (transposed storage) + float* __restrict__ C, // [M, N] + const int M, const int K, const int N) { + + // Block and thread indices + const int bx = blockIdx.x; // Block index in N dimension + const int by = blockIdx.y; // Block index in M dimension + const int tx = threadIdx.x; // Thread index in N dimension (0 to THREADS_N-1) + const int ty = threadIdx.y; // Thread index in M dimension (0 to THREADS_M-1) + + // Starting position of this block's output tile in C + const int c_row_start = by * BM; + const int c_col_start = bx * BN; + + // Shared memory for A and B tiles + __shared__ float As[BM][BK]; + __shared__ float Bs[BK][BN]; + + // Register accumulator for this thread's output tile (TM x TN elements) + float c_reg[TM][TN] = {{0.0f}}; + + // This thread's position within the block's output tile + const int thread_row = ty * TM; // Starting row in block tile + const int thread_col = tx * TN; // Starting col in block tile + + // Linear thread ID for cooperative loading + const int tid = ty * (BN / TN) + tx; + constexpr int NUM_THREADS = (BM / TM) * (BN / TN); + + // Calculate how many elements each thread loads for A and B tiles + constexpr int A_TILE_ELEMENTS = BM * BK; + constexpr int B_TILE_ELEMENTS = BK * BN; + constexpr int A_LOADS_PER_THREAD = (A_TILE_ELEMENTS + NUM_THREADS - 1) / NUM_THREADS; + constexpr int B_LOADS_PER_THREAD = (B_TILE_ELEMENTS + NUM_THREADS - 1) / NUM_THREADS; + + // Iterate over K dimension in chunks of BK + for (int k_start = 0; k_start < K; k_start += BK) { + + // ========== Load A tile into shared memory ========== + // A tile: [BM, BK] from A[c_row_start:c_row_start+BM, k_start:k_start+BK] + #pragma unroll + for (int load_idx = 0; load_idx < A_LOADS_PER_THREAD; ++load_idx) { + int elem_idx = tid + load_idx * NUM_THREADS; + if (elem_idx < A_TILE_ELEMENTS) { + int a_tile_row = elem_idx / BK; + int a_tile_col = elem_idx % BK; + int global_row = c_row_start + a_tile_row; + int global_col = k_start + a_tile_col; + + if (global_row < M && global_col < K) { + As[a_tile_row][a_tile_col] = A[global_row * K + global_col]; + } else { + As[a_tile_row][a_tile_col] = 0.0f; + } + } + } + + // ========== Load B tile into shared memory ========== + // B is stored as [N, K], we load B[c_col_start:c_col_start+BN, k_start:k_start+BK] + // and store it transposed as Bs[BK, BN] for coalesced access during computation + #pragma unroll + for (int load_idx = 0; load_idx < B_LOADS_PER_THREAD; ++load_idx) { + int elem_idx = tid + load_idx * NUM_THREADS; + if (elem_idx < B_TILE_ELEMENTS) { + int b_tile_row = elem_idx / BN; // K dimension (0 to BK-1) + int b_tile_col = elem_idx % BN; // N dimension (0 to BN-1) + int global_k = k_start + b_tile_row; + int global_n = c_col_start + b_tile_col; + + if (global_k < K && global_n < N) { + // B[N, K] -> Bs[K, N]: B[global_n, global_k] = B[global_n * K + global_k] + Bs[b_tile_row][b_tile_col] = B[global_n * K + global_k]; + } else { + Bs[b_tile_row][b_tile_col] = 0.0f; + } + } + } + + __syncthreads(); + + // ========== Compute partial results ========== + // Each thread computes its TM x TN output elements + #pragma unroll + for (int k = 0; k < BK; ++k) { + // Load A values for this thread's rows into registers + float a_reg[TM]; + #pragma unroll + for (int m = 0; m < TM; ++m) { + a_reg[m] = As[thread_row + m][k]; + } + + // Load B values for 
this thread's columns into registers + float b_reg[TN]; + #pragma unroll + for (int n = 0; n < TN; ++n) { + b_reg[n] = Bs[k][thread_col + n]; + } + + // Outer product accumulation + #pragma unroll + for (int m = 0; m < TM; ++m) { + #pragma unroll + for (int n = 0; n < TN; ++n) { + c_reg[m][n] += a_reg[m] * b_reg[n]; + } + } + } + + __syncthreads(); + } + + // ========== Write results to global memory ========== + #pragma unroll + for (int m = 0; m < TM; ++m) { + #pragma unroll + for (int n = 0; n < TN; ++n) { + int global_row = c_row_start + thread_row + m; + int global_col = c_col_start + thread_col + n; + + if (global_row < M && global_col < N) { + C[global_row * N + global_col] = c_reg[m][n]; + } + } + } +} + +// ============================================================================ +// Batched GEMV Kernel for Decode Phase +// ============================================================================ +// For decode phase: each sequence has 1 token, but we have multiple sequences +// Input: [batch_size, in_dim], Weight: [out_dim, in_dim], Output: [batch_size, out_dim] + +template +__global__ void gemv_batched_kernel( + const float* __restrict__ input, // [batch_size, in_dim] + const float* __restrict__ weight, // [out_dim, in_dim] + float* __restrict__ output, // [batch_size, out_dim] + const int batch_size, + const int in_dim, + const int out_dim) { + + // Grid: (out_dim, batch_size) - each block computes one output element for one sequence + const int out_idx = blockIdx.x; + const int batch_idx = blockIdx.y; + const int tid = threadIdx.x; + + if (batch_idx >= batch_size || out_idx >= out_dim) return; + + // Pointers to this sequence's input and weight row + const float* x = input + batch_idx * in_dim; + const float* w = weight + out_idx * in_dim; + + // Shared memory for partial sums + __shared__ float sdata[THREADS]; + float sum = 0.0f; + + // Vectorized load using float4 for better memory bandwidth + const int vec_size = 4; + const int vec_num = in_dim / vec_size; + const float4* x_vec = reinterpret_cast(x); + const float4* w_vec = reinterpret_cast(w); + + // Process vectorized portion + for (int i = tid; i < vec_num; i += THREADS) { + float4 xv = x_vec[i]; + float4 wv = w_vec[i]; + sum += xv.x * wv.x + xv.y * wv.y + xv.z * wv.z + xv.w * wv.w; + } + + // Process remaining elements + const int vec_offset = vec_num * vec_size; + for (int i = vec_offset + tid; i < in_dim; i += THREADS) { + sum += x[i] * w[i]; + } + + sdata[tid] = sum; + __syncthreads(); + + // Block reduction + for (int s = THREADS / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata[tid] += sdata[tid + s]; + } + __syncthreads(); + } + + // Write result + if (tid == 0) { + output[batch_idx * out_dim + out_idx] = sdata[0]; + } +} + +// ============================================================================ +// Public Interface Functions +// ============================================================================ + +void gemm_kernel_cu(const tensor::Tensor& input, const tensor::Tensor& weight, + const tensor::Tensor& output, const CudaConfig* config) { + CHECK(!input.is_empty()) << "Input tensor is empty"; + CHECK(!weight.is_empty()) << "Weight tensor is empty"; + CHECK(input.device_type() == base::DeviceType::kDeviceCUDA) << "Input must be on CUDA"; + CHECK(weight.device_type() == base::DeviceType::kDeviceCUDA) << "Weight must be on CUDA"; + + // Determine dimensions + // Input can be [num_tokens, in_dim] or [in_dim] for single token + int M, K, N; + if (input.dims_size() == 1) { + // Single token case: 
[in_dim] + M = 1; + K = input.get_dim(0); + } else { + // Multi-token case: [num_tokens, in_dim] + M = input.get_dim(0); + K = input.get_dim(1); + } + + // Weight is [out_dim, in_dim] + N = weight.get_dim(0); + CHECK_EQ(weight.get_dim(1), K) << "Weight dimension mismatch"; + + // For single token with 1D input, fall back to existing optimized GEMV + // Note: only fallback when input is 1D [dim], not 2D [1, dim] + if (M == 1 && input.dims_size() == 1) { + matmul_kernel_cu(input, weight, output, 1.0f, config); + return; + } + + // Multi-token: use tiled GEMM + dim3 block(THREADS_N, THREADS_M); // (16, 16) = 256 threads + dim3 grid((N + BLOCK_N - 1) / BLOCK_N, (M + BLOCK_M - 1) / BLOCK_M); + + cudaStream_t stream = config ? config->stream : nullptr; + + gemm_tiled_kernel + <<>>( + input.ptr(), + weight.ptr(), + const_cast(output.ptr()), + M, K, N); +} + +void gemv_batched_kernel_cu(const tensor::Tensor& input, const tensor::Tensor& weight, + const tensor::Tensor& output, const CudaConfig* config) { + CHECK(!input.is_empty()) << "Input tensor is empty"; + CHECK(!weight.is_empty()) << "Weight tensor is empty"; + CHECK(input.device_type() == base::DeviceType::kDeviceCUDA) << "Input must be on CUDA"; + CHECK(weight.device_type() == base::DeviceType::kDeviceCUDA) << "Weight must be on CUDA"; + CHECK_EQ(input.dims_size(), 2) << "Input must be 2D [batch_size, in_dim]"; + + const int batch_size = input.get_dim(0); + const int in_dim = input.get_dim(1); + const int out_dim = weight.get_dim(0); + + CHECK_EQ(weight.get_dim(1), in_dim) << "Weight dimension mismatch"; + + // For small batch sizes, use custom batched GEMV kernel + // For larger batches, GEMM might be more efficient + constexpr int BATCH_THRESHOLD = 16; + + if (batch_size <= BATCH_THRESHOLD) { + constexpr int THREADS = 256; + dim3 grid(out_dim, batch_size); + dim3 block(THREADS); + + cudaStream_t stream = config ? config->stream : nullptr; + + gemv_batched_kernel<<>>( + input.ptr(), + weight.ptr(), + const_cast(output.ptr()), + batch_size, in_dim, out_dim); + } else { + // Fall back to GEMM for larger batches + gemm_kernel_cu(input, weight, output, config); + } +} + +} // namespace kernel diff --git a/kuiper/source/op/kernels/cuda/gemm_kernel.cuh b/kuiper/source/op/kernels/cuda/gemm_kernel.cuh new file mode 100644 index 0000000..44c0a93 --- /dev/null +++ b/kuiper/source/op/kernels/cuda/gemm_kernel.cuh @@ -0,0 +1,42 @@ +#ifndef GEMM_KERNEL_CU_CUH +#define GEMM_KERNEL_CU_CUH +#include "../kernels_interface.h" +#include "tensor/tensor.h" + +namespace kernel { + +/** + * Tiled GEMM kernel for multi-token matrix multiplication + * + * Computes: C[M, N] = A[M, K] × B[K, N]^T + * Where B is stored as [N, K] (row-major, transposed) + * + * For LLM inference: + * A = Input [num_tokens, in_dim] + * B = Weight [out_dim, in_dim] (stored transposed) + * C = Output [num_tokens, out_dim] + * + * @param input Input tensor [num_tokens, in_dim] or [in_dim] for single token + * @param weight Weight tensor [out_dim, in_dim] + * @param output Output tensor [num_tokens, out_dim] or [out_dim] for single token + * @param config CUDA configuration (stream, etc.) 
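 *
 * Usage sketch (minimal, assuming fp32 CUDA tensors already allocated with the shapes
 * documented above; `cuda_config` stands for whatever CudaConfig the caller owns):
 *
 *   // input: [num_tokens, in_dim], weight: [out_dim, in_dim], output: [num_tokens, out_dim]
 *   kernel::gemm_kernel_cu(input, weight, output, cuda_config);
 *
 *   // A 1-D single-token input falls back to the existing matmul_kernel_cu GEMV path
 *   // (see gemm_kernel.cu); multi-token inputs go through the tiled GEMM.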
+ */ +void gemm_kernel_cu(const tensor::Tensor& input, const tensor::Tensor& weight, + const tensor::Tensor& output, const CudaConfig* config = nullptr); + +/** + * Batched GEMV kernel for multi-sequence decode phase + * Each sequence has exactly 1 token + * + * @param input Input tensor [batch_size, in_dim] + * @param weight Weight tensor [out_dim, in_dim] + * @param output Output tensor [batch_size, out_dim] + * @param config CUDA configuration + */ +void gemv_batched_kernel_cu(const tensor::Tensor& input, const tensor::Tensor& weight, + const tensor::Tensor& output, const CudaConfig* config = nullptr); + + +} // namespace kernel + +#endif // GEMM_KERNEL_CU_CUH diff --git a/kuiper/source/op/kernels/cuda/paged_attention_kernel.cu b/kuiper/source/op/kernels/cuda/paged_attention_kernel.cu new file mode 100644 index 0000000..d51941c --- /dev/null +++ b/kuiper/source/op/kernels/cuda/paged_attention_kernel.cu @@ -0,0 +1,628 @@ +/** + * @author jintang-coder + * - PagedAttention: GPU kernels for paged KV cache attention + * + * ============================================================================= + * PagedAttention CUDA Kernels + * ============================================================================= + * + * 功能: 在分页KV Cache上执行注意力计算,支持Decode/Prefill/Mixed批次 + * + * KV Cache 内存布局: + * kv_cache[Total_Blocks, Num_Layers, 2(K/V), Block_Size, KV_Dim] + * + * 访问某个token的Key: + * physical_block = block_tables[seq_idx][logical_block] + * offset = physical_block * block_stride + * + layer_idx * layer_stride + * + 0 * kv_plane_stride // K plane + * + token_in_block * kv_dim + * + * --- + * Kernel 1: paged_attention_kernel (Decode专用) + * + * Grid: (num_seqs, num_heads) + * Block: 128 threads + * + * 每个block处理一个(seq, head)对: + * 1. 加载query[seq][head] + * 2. 遍历该seq的所有KV blocks + * 3. 计算attention scores (online softmax) + * 4. 输出加权value + * + * 例子: seq0有50个tokens,block_size=16 + * num_logical_blocks = ceil(50/16) = 4 + * 遍历 block_tables[0][0..3] 获取物理块ID + * 最后一个块只有 50%16=2 个有效token + * + * --- + * Kernel 2: batch_kv_cache_write_kernel (Decode写入) + * + * Grid: (num_seqs) + * Block: 256 threads + * + * 每个block写入一个序列的新KV: + * seq_idx = blockIdx.x + * token_pos = positions[seq_idx] + * logical_block = token_pos / block_size + * physical_block = block_tables[seq_idx][logical_block] + * 写入 kv_cache[physical_block][layer][K/V][token_in_block] + * + * --- + * Kernel 3: batch_kv_cache_write_kernel_v2 (Prefill/Mixed写入) + * + * Grid: (num_tokens) // 注意: 按token而非seq + * Block: 256 threads + * + * 关键区别: 使用 seq_ids[] 映射token到序列 + * + * 例子: Mixed batch + * all_tokens = [t0..t9, t10, t11..t15] // 16 tokens + * seq_ids = [0,0,0,0,0,0,0,0,0,0, 1, 2,2,2,2,2] + * positions = [0,1,2,3,4,5,6,7,8,9, 25, 0,1,2,3,4] + * + * token_idx=10 (seq1的decode token): + * seq_idx = seq_ids[10] = 1 + * token_pos = positions[10] = 25 + * logical_block = 25/16 = 1 + * physical_block = block_tables[1][1] + * token_in_block = 25%16 = 9 + * + * --- + * Kernel 4: paged_attention_prefill_kernel (Prefill/Mixed注意力) + * + * Grid: (num_tokens, num_heads) // 每个query token一个block + * Block: 128 threads + * + * 关键: Causal Mask + * 每个query只attend到 position <= 自己的位置 + * + * 例子: Prefill seq0, positions=[0,1,2,3,4] + * query at pos=0: attend to [0] + * query at pos=1: attend to [0,1] + * query at pos=2: attend to [0,1,2] + * ... 
+ *
+ *   Implementation:
+ *     effective_context_len = min(context_lens[seq_idx], query_pos + 1)
+ *
+ * ---
+ * Online softmax (FlashAttention style):
+ *
+ *   Avoid materializing the full attention matrix; update token by token:
+ *
+ *   m_prev = -INF, d_prev = 0, acc_out = 0
+ *   for each key_token:
+ *     score      = dot(q, k) * scale
+ *     m_new      = max(m_prev, score)
+ *     exp_score  = exp(score - m_new)
+ *     correction = exp(m_prev - m_new)
+ *     d_prev     = d_prev * correction + exp_score
+ *     acc_out    = acc_out * correction + v * exp_score
+ *     m_prev     = m_new
+ *   output = acc_out / d_prev
+ *
+ * ---
+ * Data flow overview:
+ *
+ *   BatchMetadata                    GPU kernels
+ *   ┌─────────────────┐       ┌───────────────────────────────┐
+ *   │ seq_ids[16]     │──────►│ batch_kv_cache_write_v2       │
+ *   │ positions[16]   │       │ write K/V to the right block  │
+ *   └─────────────────┘       └───────────────────────────────┘
+ *                                             │
+ *   KVCacheManager                            ▼
+ *   ┌─────────────────┐       ┌───────────────────────────────┐
+ *   │ block_tables    │──────►│ paged_attention_prefill       │
+ *   │ context_lens    │       │ read K/V, compute attention   │
+ *   │ kv_cache        │       └───────────────────────────────┘
+ *   └─────────────────┘                       │
+ *                                             ▼
+ *                             ┌───────────────────────────────┐
+ *                             │ Output: [num_tokens, dim]     │
+ *                             └───────────────────────────────┘
+ *
+ */
+
+#include <cub/cub.cuh>
+#include <cuda_runtime.h>
+#include <cfloat>
+#include <glog/logging.h>
+#include "paged_attention_kernel.cuh"
+#include <cmath>
+
+namespace kernel {
+// Kernel implementations (v1, simplified)
+
+__global__ void paged_attention_kernel_kernel(
+    float* __restrict__ out,                    // [num_seqs, num_heads, head_dim]
+    const float* __restrict__ query,            // [num_seqs, num_heads, head_dim]
+    const float* __restrict__ key_value_cache,  // [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] (KV_Dim = num_kv_heads * head_dim)
+    const int* __restrict__ block_tables,
+    const int* __restrict__ context_lens,
+    const int max_context_len,
+    const int num_seqs,
+    const int num_heads,
+    const int num_kv_heads,
+    const int head_dim,
+    const int block_size,
+    const int max_blocks_per_seq,
+    const float scale,
+    const int layer_idx,
+    const int num_layers
+) {
+  int seq_idx = blockIdx.x;
+  int head_idx = blockIdx.y;
+  int tid = threadIdx.x;  // indexes one element of head_dim
+  if (seq_idx >= num_seqs || head_idx >= num_heads) return;
+
+  const long long token_stride = (long long)num_kv_heads * head_dim;
+  const long long kv_plane_stride = (long long)block_size * token_stride;
+  const long long layer_stride = 2LL * kv_plane_stride;
+  const long long block_stride = (long long)num_layers * layer_stride;
+  const long long kv_head_idx = head_idx / (num_heads / num_kv_heads);
+  const long long layer_offset = layer_stride * layer_idx;
+
+  float q_val = 0.0f;
+  if (tid < head_dim) {
+    q_val = query[(long long)seq_idx * num_heads * head_dim + (long long)head_idx * head_dim + tid];
+  }
+
+  using BlockReduce = cub::BlockReduce<float, 128>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  __shared__ float shared_score;
+  float m_prev = -FLT_MAX;
+  float d_prev = 0.0f;
+  float acc_out = 0.0f;
+  int context_len = context_lens[seq_idx];
+  int num_logical_blocks = (context_len - 1) / block_size + 1;
+  for (int i = 0; i < num_logical_blocks; i++) {
+    int physical_block_id = block_tables[seq_idx * max_blocks_per_seq + i];
+    int valid_tokens = block_size;
+    if (i == num_logical_blocks - 1) {
+      valid_tokens = context_len % block_size;
+      if (!valid_tokens) valid_tokens = block_size;
+    }
+
+    for (int t = 0; t < valid_tokens; t++) {
+      const long long offset =
+          (long long)physical_block_id * block_stride +
+          layer_offset +
+          (long long)t * token_stride +
+          kv_head_idx * head_dim +
+          tid;
+
+      float dot = 0.0f;
+      if (tid < head_dim) {
+        dot = q_val * key_value_cache[offset];
+      }
+
+      float dot_sum = BlockReduce(temp_storage).Sum(dot);
+      __syncthreads();
+      if (tid == 0) {
+        shared_score = dot_sum * scale;
+      }
+      __syncthreads();
+
+      float score = shared_score;
+      float m_new = max(m_prev, score);
+      float exp_score = expf(score - m_new);
+      float correction = expf(m_prev - m_new);
+      d_prev = d_prev * correction + exp_score;
+
+      float v_val = 0.0f;
+      if (tid < head_dim) {
+        v_val = key_value_cache[offset + kv_plane_stride];
+      }
+      acc_out = acc_out * correction + v_val * exp_score;
+      m_prev = m_new;
+    }
+  }
+
+  if (tid < head_dim) {
+    out[(long long)seq_idx * num_heads * head_dim + (long long)head_idx * head_dim + tid] =
+        (d_prev > 0.0f) ? (acc_out / d_prev) : 0.0f;
+  }
+}
+
+void paged_attention_kernel_cu(
+    tensor::Tensor& out,
+    const tensor::Tensor& query,
+    const tensor::Tensor& key_value_cache,
+    const tensor::Tensor& block_tables,
+    const tensor::Tensor& context_lens,
+    int max_context_len,
+    int num_seqs,
+    int num_heads,
+    int num_kv_heads,
+    int head_dim,
+    int block_size,
+    int max_blocks_per_seq,
+    float scale,
+    int layer_idx,
+    int num_layers,
+    cudaStream_t stream
+) {
+  CHECK_EQ(out.is_empty(), false);
+  CHECK_EQ(query.is_empty(), false);
+  CHECK_EQ(key_value_cache.is_empty(), false);
+  CHECK_EQ(block_tables.is_empty(), false);
+  CHECK_EQ(context_lens.is_empty(), false);
+
+  dim3 grid(num_seqs, num_heads);
+  dim3 block(128);
+
+  paged_attention_kernel_kernel<<<grid, block, 0, stream>>>(
+      out.ptr<float>(), query.ptr<float>(), key_value_cache.ptr<float>(),
+      block_tables.ptr<int32_t>(), context_lens.ptr<int32_t>(), max_context_len, num_seqs, num_heads,
+      num_kv_heads, head_dim, block_size, max_blocks_per_seq, scale, layer_idx, num_layers);
+}
+
+// Batch KV cache write kernel
+// Each thread block handles one sequence, threads handle kv_dim elements
+__global__ void batch_kv_cache_write_kernel(
+    float* __restrict__ key_value_cache,  // [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim]
+    const float* __restrict__ key,        // [num_seqs, kv_dim]
const float* __restrict__ value, // [num_seqs, kv_dim] + const int* __restrict__ block_tables, // [num_seqs, max_blocks_per_seq] + const int* __restrict__ positions, // [num_seqs] + const int num_seqs, + const int kv_dim, + const int block_size, + const int max_blocks_per_seq, + const int layer_idx, + const int num_layers +) { + int seq_idx = blockIdx.x; + int tid = threadIdx.x; + + if (seq_idx >= num_seqs) return; + + // Get token position for this sequence + int token_pos = positions[seq_idx]; + + // Calculate which block and position within block + int logical_block = token_pos / block_size; + int token_in_block = token_pos % block_size; + + // Get physical block ID from block table + int physical_block_id = block_tables[seq_idx * max_blocks_per_seq + logical_block]; + + // Calculate strides for KV cache layout: [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] + const long long token_stride = (long long)kv_dim; + const long long kv_plane_stride = (long long)block_size * token_stride; + const long long layer_stride = 2LL * kv_plane_stride; + const long long block_stride = (long long)num_layers * layer_stride; + + // Base offset for this block, layer, and token position + const long long base_offset = (long long)physical_block_id * block_stride + + (long long)layer_idx * layer_stride + + (long long)token_in_block * token_stride; + + // Key offset (kv_plane index 0) + const long long key_offset = base_offset; + // Value offset (kv_plane index 1) + const long long value_offset = base_offset + kv_plane_stride; + + // Input offset for this sequence + const long long input_offset = (long long)seq_idx * kv_dim; + + // Copy key and value elements (each thread handles multiple elements if kv_dim > blockDim.x) + for (int i = tid; i < kv_dim; i += blockDim.x) { + key_value_cache[key_offset + i] = key[input_offset + i]; + key_value_cache[value_offset + i] = value[input_offset + i]; + } +} + +void batch_kv_cache_write_cu( + tensor::Tensor& key_value_cache, + const tensor::Tensor& key, + const tensor::Tensor& value, + const tensor::Tensor& block_tables, + const tensor::Tensor& positions, + int num_seqs, + int num_kv_heads, + int head_dim, + int block_size, + int max_blocks_per_seq, + int layer_idx, + int num_layers, + cudaStream_t stream +) { + CHECK_EQ(key_value_cache.is_empty(), false); + CHECK_EQ(key.is_empty(), false); + CHECK_EQ(value.is_empty(), false); + CHECK_EQ(block_tables.is_empty(), false); + CHECK_EQ(positions.is_empty(), false); + + int kv_dim = num_kv_heads * head_dim; + + // Launch one block per sequence, 256 threads per block + dim3 grid(num_seqs); + dim3 block(256); + + batch_kv_cache_write_kernel<<>>( + key_value_cache.ptr(), + key.ptr(), + value.ptr(), + block_tables.ptr(), + positions.ptr(), + num_seqs, + kv_dim, + block_size, + max_blocks_per_seq, + layer_idx, + num_layers + ); +} + +// ============================================================================ +// Batch KV cache write kernel v2 (unified version) +// Supports chunked prefill and continuous batching +// Uses seq_ids to map tokens to sequences +// ============================================================================ +__global__ void batch_kv_cache_write_kernel_v2( + float* __restrict__ key_value_cache, // [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] + const float* __restrict__ key, // [num_tokens, kv_dim] + const float* __restrict__ value, // [num_tokens, kv_dim] + const int* __restrict__ block_tables, // [num_seqs, max_blocks_per_seq] + const int* __restrict__ positions, // [num_tokens] + const int* 
__restrict__ seq_ids, // [num_tokens] - sequence index for each token + const int num_tokens, + const int kv_dim, + const int block_size, + const int max_blocks_per_seq, + const int layer_idx, + const int num_layers +) { + int token_idx = blockIdx.x; + int tid = threadIdx.x; + + if (token_idx >= num_tokens) return; + + // Get sequence index from seq_ids (key difference from v1) + int seq_idx = seq_ids[token_idx]; + int token_pos = positions[token_idx]; + + // Calculate which block and position within block + int logical_block = token_pos / block_size; + int token_in_block = token_pos % block_size; + + // Get physical block ID from block table + int physical_block_id = block_tables[seq_idx * max_blocks_per_seq + logical_block]; + + // Calculate strides for KV cache layout: [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] + const long long token_stride = (long long)kv_dim; + const long long kv_plane_stride = (long long)block_size * token_stride; + const long long layer_stride = 2LL * kv_plane_stride; + const long long block_stride = (long long)num_layers * layer_stride; + + // Base offset for this block, layer, and token position + const long long base_offset = (long long)physical_block_id * block_stride + + (long long)layer_idx * layer_stride + + (long long)token_in_block * token_stride; + + // Key offset (kv_plane index 0) + const long long key_offset = base_offset; + // Value offset (kv_plane index 1) + const long long value_offset = base_offset + kv_plane_stride; + + // Input offset for this token + const long long input_offset = (long long)token_idx * kv_dim; + + // Copy key and value elements + for (int i = tid; i < kv_dim; i += blockDim.x) { + key_value_cache[key_offset + i] = key[input_offset + i]; + key_value_cache[value_offset + i] = value[input_offset + i]; + } +} + +void batch_kv_cache_write_v2_cu( + tensor::Tensor& key_value_cache, + const tensor::Tensor& key, + const tensor::Tensor& value, + const tensor::Tensor& block_tables, + const tensor::Tensor& positions, + const tensor::Tensor& seq_ids, + int num_tokens, + int num_kv_heads, + int head_dim, + int block_size, + int max_blocks_per_seq, + int layer_idx, + int num_layers, + cudaStream_t stream +) { + CHECK_EQ(key_value_cache.is_empty(), false); + CHECK_EQ(key.is_empty(), false); + CHECK_EQ(value.is_empty(), false); + CHECK_EQ(block_tables.is_empty(), false); + CHECK_EQ(positions.is_empty(), false); + CHECK_EQ(seq_ids.is_empty(), false); + + int kv_dim = num_kv_heads * head_dim; + + // Launch one block per token, 256 threads per block + dim3 grid(num_tokens); + dim3 block(256); + + batch_kv_cache_write_kernel_v2<<>>( + key_value_cache.ptr(), + key.ptr(), + value.ptr(), + block_tables.ptr(), + positions.ptr(), + seq_ids.ptr(), + num_tokens, + kv_dim, + block_size, + max_blocks_per_seq, + layer_idx, + num_layers + ); +} + +// ============================================================================ +// Paged attention prefill kernel +// Supports multiple query tokens per sequence with causal mask +// Each query only attends to positions <= its own position +// ============================================================================ +__global__ void paged_attention_prefill_kernel_impl( + float* __restrict__ out, // [num_tokens, num_heads, head_dim] + const float* __restrict__ query, // [num_tokens, num_heads, head_dim] + const float* __restrict__ key_value_cache, // [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] + const int* __restrict__ block_tables, // [num_seqs, max_blocks_per_seq] + const int* __restrict__ 
context_lens, // [num_seqs] + const int* __restrict__ seq_ids, // [num_tokens] + const int* __restrict__ positions, // [num_tokens] - for causal mask + const int num_tokens, + const int num_seqs, + const int num_heads, + const int num_kv_heads, + const int head_dim, + const int block_size, + const int max_blocks_per_seq, + const float scale, + const int layer_idx, + const int num_layers +) { + // Grid: (num_tokens, num_heads) + int token_idx = blockIdx.x; + int head_idx = blockIdx.y; + int tid = threadIdx.x; + + if (token_idx >= num_tokens || head_idx >= num_heads) return; + + int seq_idx = seq_ids[token_idx]; + int query_pos = positions[token_idx]; // Current query's position + + // Causal mask: only attend to positions <= query_pos + // Also limited by context_lens[seq_idx] + int context_len = context_lens[seq_idx]; + int effective_context_len = min(context_len, query_pos + 1); + + if (effective_context_len <= 0) { + // No keys to attend to, output zeros + if (tid < head_dim) { + out[(long long)token_idx * num_heads * head_dim + (long long)head_idx * head_dim + tid] = 0.0f; + } + return; + } + + // Calculate strides + const long long token_stride = (long long)num_kv_heads * head_dim; + const long long kv_plane_stride = (long long)block_size * token_stride; + const long long layer_stride = 2LL * kv_plane_stride; + const long long block_stride = (long long)num_layers * layer_stride; + const long long kv_head_idx = head_idx / (num_heads / num_kv_heads); + const long long layer_offset = layer_stride * layer_idx; + + // Load query value + float q_val = 0.0f; + if (tid < head_dim) { + q_val = query[(long long)token_idx * num_heads * head_dim + (long long)head_idx * head_dim + tid]; + } + + // FlashAttention-style online softmax + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ float shared_score; + + float m_prev = -FLT_MAX; + float d_prev = 0.0f; + float acc_out = 0.0f; + + int num_logical_blocks = (effective_context_len - 1) / block_size + 1; + + for (int i = 0; i < num_logical_blocks; i++) { + int physical_block_id = block_tables[seq_idx * max_blocks_per_seq + i]; + int valid_tokens = block_size; + if (i == num_logical_blocks - 1) { + valid_tokens = effective_context_len % block_size; + if (!valid_tokens) valid_tokens = block_size; + } + + for (int t = 0; t < valid_tokens; t++) { + const long long offset = + (long long)physical_block_id * block_stride + + layer_offset + + (long long)t * token_stride + + kv_head_idx * head_dim + + tid; + + float dot = 0.0f; + if (tid < head_dim) { + float k_val = key_value_cache[offset]; + dot = q_val * k_val; + } + + float dot_sum = BlockReduce(temp_storage).Sum(dot); + __syncthreads(); + if (tid == 0) { + shared_score = dot_sum * scale; + } + __syncthreads(); + + float score = shared_score; + float m_new = max(m_prev, score); + float exp_score = expf(score - m_new); + float correction = expf(m_prev - m_new); + d_prev = d_prev * correction + exp_score; + + float v_val = 0.0f; + if (tid < head_dim) { + v_val = key_value_cache[offset + kv_plane_stride]; + } + acc_out = acc_out * correction + v_val * exp_score; + m_prev = m_new; + } + } + + if (tid < head_dim) { + float final_out = (d_prev > 0.0f) ? 
(acc_out / d_prev) : 0.0f; + out[(long long)token_idx * num_heads * head_dim + (long long)head_idx * head_dim + tid] = final_out; + } +} + +void paged_attention_prefill_cu( + tensor::Tensor& out, + const tensor::Tensor& query, + const tensor::Tensor& key_value_cache, + const tensor::Tensor& block_tables, + const tensor::Tensor& context_lens, + const tensor::Tensor& seq_ids, + const tensor::Tensor& positions, + int num_tokens, + int num_seqs, + int num_heads, + int num_kv_heads, + int head_dim, + int block_size, + int max_blocks_per_seq, + float scale, + int layer_idx, + int num_layers, + cudaStream_t stream +) { + CHECK_EQ(out.is_empty(), false); + CHECK_EQ(query.is_empty(), false); + CHECK_EQ(key_value_cache.is_empty(), false); + CHECK_EQ(block_tables.is_empty(), false); + CHECK_EQ(context_lens.is_empty(), false); + CHECK_EQ(seq_ids.is_empty(), false); + CHECK_EQ(positions.is_empty(), false); + + dim3 grid(num_tokens, num_heads); + dim3 block(128); + + paged_attention_prefill_kernel_impl<<>>( + out.ptr(), + query.ptr(), + key_value_cache.ptr(), + block_tables.ptr(), + context_lens.ptr(), + seq_ids.ptr(), + positions.ptr(), + num_tokens, + num_seqs, + num_heads, + num_kv_heads, + head_dim, + block_size, + max_blocks_per_seq, + scale, + layer_idx, + num_layers + ); +} + +} \ No newline at end of file diff --git a/kuiper/source/op/kernels/cuda/paged_attention_kernel.cuh b/kuiper/source/op/kernels/cuda/paged_attention_kernel.cuh new file mode 100644 index 0000000..71c29b1 --- /dev/null +++ b/kuiper/source/op/kernels/cuda/paged_attention_kernel.cuh @@ -0,0 +1,103 @@ + +#ifndef PAGED_ATTENTION_KERNEL_CU_H +#define PAGED_ATTENTION_KERNEL_CU_H + +#include "tensor/tensor.h" +#include + +namespace kernel { + +// 声明 CUDA 实现函数,参数需严格匹配 kernels_interface.h 中的 PagedAttentionKernel +void paged_attention_kernel_cu( + tensor::Tensor& out, + const tensor::Tensor& query, + const tensor::Tensor& key_value_cache, + const tensor::Tensor& block_tables, + const tensor::Tensor& context_lens, + int max_context_len, + int num_seqs, + int num_heads, + int num_kv_heads, + int head_dim, + int block_size, + int max_blocks_per_seq, + float scale, + int layer_idx , + int num_layers, + cudaStream_t stream +); + +// Batch KV cache write kernel (legacy version) +// Writes K and V for multiple sequences to paged KV cache +// Assumes token_idx == seq_idx (one token per sequence) +// key: [num_seqs, kv_dim] +// value: [num_seqs, kv_dim] +// positions: [num_seqs] - token position for each sequence +void batch_kv_cache_write_cu( + tensor::Tensor& key_value_cache, // [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] + const tensor::Tensor& key, // [num_seqs, kv_dim] + const tensor::Tensor& value, // [num_seqs, kv_dim] + const tensor::Tensor& block_tables, // [num_seqs, max_blocks_per_seq] + const tensor::Tensor& positions, // [num_seqs] - token positions + int num_seqs, + int num_kv_heads, + int head_dim, + int block_size, + int max_blocks_per_seq, + int layer_idx, + int num_layers, + cudaStream_t stream +); + +// Batch KV cache write kernel v2 (unified version) +// Supports chunked prefill and continuous batching +// Uses seq_ids to map tokens to sequences +// key: [num_tokens, kv_dim] +// value: [num_tokens, kv_dim] +// positions: [num_tokens] - position for each token +// seq_ids: [num_tokens] - sequence index for each token +void batch_kv_cache_write_v2_cu( + tensor::Tensor& key_value_cache, // [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] + const tensor::Tensor& key, // [num_tokens, kv_dim] + const tensor::Tensor& value, 
// [num_tokens, kv_dim] + const tensor::Tensor& block_tables, // [num_seqs, max_blocks_per_seq] + const tensor::Tensor& positions, // [num_tokens] - token positions + const tensor::Tensor& seq_ids, // [num_tokens] - sequence index for each token + int num_tokens, + int num_kv_heads, + int head_dim, + int block_size, + int max_blocks_per_seq, + int layer_idx, + int num_layers, + cudaStream_t stream +); + +// Paged attention prefill kernel +// Supports multiple query tokens per sequence with causal mask +// query: [num_tokens, num_heads, head_dim] +// Uses positions for causal masking (each query only attends to positions <= its own) +void paged_attention_prefill_cu( + tensor::Tensor& out, // [num_tokens, num_heads, head_dim] + const tensor::Tensor& query, // [num_tokens, num_heads, head_dim] + const tensor::Tensor& key_value_cache, // [Total_Blocks, Num_Layers, 2, Block_Size, KV_Dim] + const tensor::Tensor& block_tables, // [num_seqs, max_blocks_per_seq] + const tensor::Tensor& context_lens, // [num_seqs] - context length for each sequence + const tensor::Tensor& seq_ids, // [num_tokens] - sequence index for each token + const tensor::Tensor& positions, // [num_tokens] - position for each token (for causal mask) + int num_tokens, + int num_seqs, + int num_heads, + int num_kv_heads, + int head_dim, + int block_size, + int max_blocks_per_seq, + float scale, + int layer_idx, + int num_layers, + cudaStream_t stream +); + +} // namespace kernel + +#endif // PAGED_ATTENTION_KERNEL_CU_H diff --git a/kuiper/source/op/kernels/cuda/rope_kernel.cu b/kuiper/source/op/kernels/cuda/rope_kernel.cu index d704d24..6e9822e 100644 --- a/kuiper/source/op/kernels/cuda/rope_kernel.cu +++ b/kuiper/source/op/kernels/cuda/rope_kernel.cu @@ -168,4 +168,132 @@ void rope_kernel_cu(int32_t dim, int32_t kv_dim, int32_t head_size, const tensor cos_cache.ptr()); } } + +// ============================================================================ +// Batched RoPE Kernel for multi-token processing +// ============================================================================ +// Grid: (num_tokens) +// Block: processes one token's Q and K vectors + +#if defined(LLAMA3_SUPPORT) || defined(QWEN2_SUPPORT) || defined(QWEN3_SUPPORT) +__global__ void rope_kernel_batch_cu_fp32( + int num_tokens, int dim, int kv_dim, int head_size, + float* input_q, // [num_tokens, dim] + float* input_k, // [num_tokens, kv_dim] + const int* positions, // [num_tokens] + const float* sin_cache, // [max_seq_len, head_size] + const float* cos_cache) { + + int token_idx = blockIdx.x; + int tid = threadIdx.x; + + if (token_idx >= num_tokens) return; + + int pos = positions[token_idx]; + + int num_heads = dim / head_size; + int head_pair_count = head_size / 2; + int total_pairs = num_heads * head_pair_count; + + // Each thread handles one (head, pair) combination + for (int idx = tid; idx < total_pairs; idx += blockDim.x) { + int head_idx = idx / head_pair_count; + int head_dim = idx % head_pair_count; + + int i = head_idx * head_size; + int v0_idx = i + head_dim; + int v1_idx = i + head_dim + head_size / 2; + + float fci = sin_cache[pos * head_size + head_dim * 2]; + float fcr = cos_cache[pos * head_size + head_dim * 2]; + + int rotn = i < kv_dim ? 2 : 1; + + for (int v = 0; v < rotn; v++) { + float* vec = (v == 0) ? 
(input_q + token_idx * dim) : (input_k + token_idx * kv_dim); + float v0 = vec[v0_idx]; + float v1 = vec[v1_idx]; + vec[v0_idx] = fcr * v0 - fci * v1; + vec[v1_idx] = fcr * v1 + fci * v0; + } + } +} +#else +// LLama2 version +__global__ void rope_kernel_batch_cu_fp32( + int num_tokens, int dim, int kv_dim, int head_size, + float* input_q, // [num_tokens, dim] + float* input_k, // [num_tokens, kv_dim] + const int* positions, // [num_tokens] + const float* sin_cache, // [max_seq_len, head_size] + const float* cos_cache) { + + int token_idx = blockIdx.x; + int tid = threadIdx.x; + + if (token_idx >= num_tokens) return; + + int pos = positions[token_idx]; + float* q_ptr = input_q + token_idx * dim; + float* k_ptr = input_k + token_idx * kv_dim; + + // Process Q vector + for (int idx = tid * 2; idx < dim; idx += blockDim.x * 2) { + int head_dim = idx % head_size; + float fci = sin_cache[pos * head_size + head_dim]; + float fcr = cos_cache[pos * head_size + head_dim]; + + float2* vec_ptr = reinterpret_cast(q_ptr + idx); + float2 vec_value = *vec_ptr; + *vec_ptr = make_float2(vec_value.x * fcr - vec_value.y * fci, + vec_value.x * fci + vec_value.y * fcr); + } + + // Process K vector + for (int idx = tid * 2; idx < kv_dim; idx += blockDim.x * 2) { + int head_dim = idx % head_size; + float fci = sin_cache[pos * head_size + head_dim]; + float fcr = cos_cache[pos * head_size + head_dim]; + + float2* vec_ptr = reinterpret_cast(k_ptr + idx); + float2 vec_value = *vec_ptr; + *vec_ptr = make_float2(vec_value.x * fcr - vec_value.y * fci, + vec_value.x * fci + vec_value.y * fcr); + } +} +#endif + +void rope_kernel_batch_cu(int32_t num_tokens, int32_t dim, int32_t kv_dim, int32_t head_size, + const tensor::Tensor& input_q, const tensor::Tensor& input_k, + const tensor::Tensor& positions, const tensor::Tensor& sin_cache, + const tensor::Tensor& cos_cache, void* stream) { + CHECK(!input_q.is_empty()); + CHECK(!input_k.is_empty()); + CHECK(!positions.is_empty()); + CHECK(!sin_cache.is_empty()); + CHECK(!cos_cache.is_empty()); + + int threads = 128; + int blocks = num_tokens; // One block per token + + if (stream) { + cudaStream_t stream_ = static_cast(stream); + rope_kernel_batch_cu_fp32<<>>( + num_tokens, dim, kv_dim, head_size, + const_cast(input_q.ptr()), + const_cast(input_k.ptr()), + positions.ptr(), + sin_cache.ptr(), + cos_cache.ptr()); + } else { + rope_kernel_batch_cu_fp32<<>>( + num_tokens, dim, kv_dim, head_size, + const_cast(input_q.ptr()), + const_cast(input_k.ptr()), + positions.ptr(), + sin_cache.ptr(), + cos_cache.ptr()); + } +} + } // namespace kernel \ No newline at end of file diff --git a/kuiper/source/op/kernels/cuda/rope_kernel.cuh b/kuiper/source/op/kernels/cuda/rope_kernel.cuh index 4189226..2d12db0 100644 --- a/kuiper/source/op/kernels/cuda/rope_kernel.cuh +++ b/kuiper/source/op/kernels/cuda/rope_kernel.cuh @@ -2,10 +2,21 @@ #define ROPE_KERNEL_CU_CUH #include "tensor/tensor.h" namespace kernel { + +// Original single-token RoPE void rope_kernel_cu(int32_t dim, int32_t kv_dim, int32_t head_size, const tensor::Tensor& input_q, const tensor::Tensor& input_k, const tensor::Tensor& input_pos, const tensor::Tensor& sin_cache, const tensor::Tensor& cos_cache, void* stream); +// Batched RoPE for multi-token processing (Prefill or multi-sequence Decode) +// input_q: [num_tokens, dim] +// input_k: [num_tokens, kv_dim] +// positions: [num_tokens] - position for each token +void rope_kernel_batch_cu(int32_t num_tokens, int32_t dim, int32_t kv_dim, int32_t head_size, + const tensor::Tensor& 
input_q, const tensor::Tensor& input_k, + const tensor::Tensor& positions, const tensor::Tensor& sin_cache, + const tensor::Tensor& cos_cache, void* stream); + void sin_cos_cache_calc_cu(int head_size, int max_seq_len, const tensor::Tensor& sin_cache, const tensor::Tensor& cos_cache, cudaStream_t stream); diff --git a/kuiper/source/op/kernels/kernels_interface.h b/kuiper/source/op/kernels/kernels_interface.h index 7c503c5..5dba02a 100644 --- a/kuiper/source/op/kernels/kernels_interface.h +++ b/kuiper/source/op/kernels/kernels_interface.h @@ -38,6 +38,12 @@ typedef void (*RoPEKernel)(int32_t dim, int32_t kv_dim, int32_t head_size, const tensor::Tensor& input_pos, const tensor::Tensor& sin_cache, const tensor::Tensor& cos_cache, void* stream); +// Batched RoPE kernel for multi-token processing +typedef void (*RoPEBatchKernel)(int32_t num_tokens, int32_t dim, int32_t kv_dim, int32_t head_size, + const tensor::Tensor& input_q, const tensor::Tensor& input_k, + const tensor::Tensor& positions, const tensor::Tensor& sin_cache, + const tensor::Tensor& cos_cache, void* stream); + typedef void (*ScaleKernel)(float scale, const tensor::Tensor& input, void* stream); typedef void (*SoftmaxInplaceKernel)(const tensor::Tensor& input, void* stream); @@ -46,6 +52,16 @@ typedef void (*ScaleSumKernel)(const tensor::Tensor& value, const tensor::Tensor const tensor::Tensor& output, int t, int size, int stride, void* stream); +// GEMM kernel for multi-token matrix multiplication (Prefill phase) +// Input: [num_tokens, in_dim], Weight: [out_dim, in_dim], Output: [num_tokens, out_dim] +typedef void (*GemmKernel)(const tensor::Tensor& input, const tensor::Tensor& weight, + const tensor::Tensor& output, const CudaConfig* config); + +// Batched GEMV kernel for multi-sequence decode phase +// Input: [batch_size, in_dim], Weight: [out_dim, in_dim], Output: [batch_size, out_dim] +typedef void (*GemvBatchedKernel)(const tensor::Tensor& input, const tensor::Tensor& weight, + const tensor::Tensor& output, const CudaConfig* config); + void softmax_inplace_cpu(const float* input_ptr, size_t size); AddKernel get_add_kernel(base::DeviceType device_type); @@ -62,6 +78,9 @@ RMSNormKernel get_rmsnorm_kernel(base::DeviceType device_type); RoPEKernel get_rope_kernel(base::DeviceType device_type); +// Get batched RoPE kernel for multi-token processing +RoPEBatchKernel get_rope_batch_kernel(base::DeviceType device_type); + ScaleKernel get_scale_kernel(base::DeviceType device_type); SoftmaxInplaceKernel get_softmax_kernel(base::DeviceType device_type); @@ -71,5 +90,12 @@ SwigluKernel get_swiglu_kernel(base::DeviceType device_type, void* stream = null ScaleSumKernel get_scale_sum_kernel(base::DeviceType device_type); RMSNormKernelDim get_rmsnorm_dim_kernel(base::DeviceType device_type); + +// Get GEMM kernel for multi-token matrix multiplication +GemmKernel get_gemm_kernel(base::DeviceType device_type); + +// Get batched GEMV kernel for multi-sequence decode +GemvBatchedKernel get_gemv_batched_kernel(base::DeviceType device_type); + } // namespace kernel #endif // KERNELS_INTERFACE_H diff --git a/kuiper/source/op/kernels/kernels_interfaces.cpp b/kuiper/source/op/kernels/kernels_interfaces.cpp index 6001dd3..9131fa0 100644 --- a/kuiper/source/op/kernels/kernels_interfaces.cpp +++ b/kuiper/source/op/kernels/kernels_interfaces.cpp @@ -11,6 +11,7 @@ #include "cpu/swiglu_kernel.h" #include "cuda/add_kernel.cuh" #include "cuda/emb_kernel.cuh" +#include "cuda/gemm_kernel.cuh" #include "cuda/matmul_kernel.cuh" #include "cuda/mha_kernel.cuh" 
#include "cuda/rmsnorm_kernel.cuh" @@ -82,6 +83,15 @@ RoPEKernel get_rope_kernel(base::DeviceType device_type) { } } +RoPEBatchKernel get_rope_batch_kernel(base::DeviceType device_type) { + if (device_type == base::DeviceType::kDeviceCUDA) { + return rope_kernel_batch_cu; + } else { + LOG(FATAL) << "Batched RoPE kernel is only supported on CUDA device."; + return nullptr; + } +} + ScaleKernel get_scale_kernel(base::DeviceType device_type) { if (device_type == base::DeviceType::kDeviceCPU) { return scale_inplace_cpu; @@ -140,4 +150,22 @@ ScaleSumKernel get_scale_sum_kernel(base::DeviceType device_type) { } } +GemmKernel get_gemm_kernel(base::DeviceType device_type) { + if (device_type == base::DeviceType::kDeviceCUDA) { + return gemm_kernel_cu; + } else { + LOG(FATAL) << "GEMM kernel is only supported on CUDA device."; + return nullptr; + } +} + +GemvBatchedKernel get_gemv_batched_kernel(base::DeviceType device_type) { + if (device_type == base::DeviceType::kDeviceCUDA) { + return gemv_batched_kernel_cu; + } else { + LOG(FATAL) << "Batched GEMV kernel is only supported on CUDA device."; + return nullptr; + } +} + } // namespace kernel diff --git a/kuiper/source/op/swiglu.cpp b/kuiper/source/op/swiglu.cpp index 977b49a..b290a3b 100644 --- a/kuiper/source/op/swiglu.cpp +++ b/kuiper/source/op/swiglu.cpp @@ -45,3 +45,11 @@ base::Status SwiGLULayer::forward() { } } // namespace op + + + + + + + + diff --git a/kuiper/source/tensor/tensor.cpp b/kuiper/source/tensor/tensor.cpp index 3c2336d..e36f39a 100644 --- a/kuiper/source/tensor/tensor.cpp +++ b/kuiper/source/tensor/tensor.cpp @@ -101,6 +101,30 @@ Tensor::Tensor(base::DataType data_type, std::vector dims, bool need_al } } +Tensor::Tensor(base::DataType data_type, size_t size, bool need_alloc, + std::shared_ptr alloc, void* ptr) + : data_type_(data_type), size_(size) { + // For large tensors, store size as a single dimension (may overflow int32_t, but size_ is correct) + // dims_ is used for shape info, size_ is used for actual allocation + if (size <= static_cast(INT32_MAX)) { + dims_.push_back(static_cast(size)); + } else { + // Store as two dimensions to avoid int32 overflow in dims_ + // This is just for bookkeeping; actual size is in size_ + dims_.push_back(INT32_MAX); + dims_.push_back(static_cast((size + INT32_MAX - 1) / INT32_MAX)); + } + if (need_alloc && alloc) { + allocate(alloc); + } else { + if (ptr != nullptr) { + CHECK(need_alloc == false) + << "The need_alloc is true when ptr parameter is not a null pointer."; + init_buffer(alloc, data_type_, need_alloc, ptr); + } + } +} + void Tensor::to_cuda(cudaStream_t stream) { CHECK_NE(buffer_, nullptr); const base::DeviceType device_type = this->device_type(); diff --git a/test/test_op/test_cu_gemm.cpp b/test/test_op/test_cu_gemm.cpp new file mode 100644 index 0000000..8aa6869 --- /dev/null +++ b/test/test_op/test_cu_gemm.cpp @@ -0,0 +1,345 @@ +#include +#include +#include +#include +#include +#include "../source/op/kernels/kernels_interface.h" +#include "base/buffer.h" + +using namespace kernel; + +// CPU reference implementation for GEMM +// C[M,N] = A[M,K] x B[N,K]^T (B stored as [N,K]) +void gemm_cpu_reference(const float* A, const float* B, float* C, + int M, int K, int N) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float sum = 0.0f; + for (int k = 0; k < K; ++k) { + // A[m,k] * B[n,k] + sum += A[m * K + k] * B[n * K + k]; + } + C[m * N + n] = sum; + } + } +} + +// Test basic GEMM with small matrices +TEST(test_gemm_cu, gemm_basic_small) { + auto alloc_cu = 
base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + const int M = 4; // num_tokens + const int K = 8; // in_dim + const int N = 6; // out_dim + + // Create input tensors on CPU + tensor::Tensor input(base::DataType::kDataTypeFp32, M, K, true, alloc_cpu); + tensor::Tensor weight(base::DataType::kDataTypeFp32, N, K, true, alloc_cpu); + tensor::Tensor output_cpu(base::DataType::kDataTypeFp32, M, N, true, alloc_cpu); + + // Initialize with simple values + for (int i = 0; i < M * K; ++i) { + input.index(i) = static_cast(i % 7); + } + for (int i = 0; i < N * K; ++i) { + weight.index(i) = static_cast((i % 5) - 2); + } + + // CPU reference + gemm_cpu_reference(input.ptr(), weight.ptr(), + output_cpu.ptr(), M, K, N); + + // Copy to GPU + input.to_cuda(nullptr); + weight.to_cuda(nullptr); + + tensor::Tensor output_cu(base::DataType::kDataTypeFp32, M, N, true, alloc_cu); + + // Run GEMM kernel + // Note: CudaConfig destructor will destroy the stream + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + kernel::get_gemm_kernel(base::DeviceType::kDeviceCUDA)(input, weight, output_cu, config); + cudaStreamSynchronize(config->stream); + + // Copy back and compare + output_cu.to_cpu(); + + for (int i = 0; i < M * N; ++i) { + ASSERT_NEAR(output_cu.index(i), output_cpu.index(i), 1e-4f) + << "Mismatch at index " << i; + } + + delete config; +} + +// Test GEMM with typical LLM dimensions +TEST(test_gemm_cu, gemm_llm_dimensions) { + auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + const int M = 32; // num_tokens (small batch for testing) + const int K = 256; // in_dim (reduced for testing) + const int N = 512; // out_dim + + tensor::Tensor input(base::DataType::kDataTypeFp32, M, K, true, alloc_cpu); + tensor::Tensor weight(base::DataType::kDataTypeFp32, N, K, true, alloc_cpu); + tensor::Tensor output_cpu(base::DataType::kDataTypeFp32, M, N, true, alloc_cpu); + + // Random initialization + std::mt19937 gen(42); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + for (int i = 0; i < M * K; ++i) { + input.index(i) = dist(gen); + } + for (int i = 0; i < N * K; ++i) { + weight.index(i) = dist(gen); + } + + // CPU reference + gemm_cpu_reference(input.ptr(), weight.ptr(), + output_cpu.ptr(), M, K, N); + + // GPU computation + input.to_cuda(nullptr); + weight.to_cuda(nullptr); + + tensor::Tensor output_cu(base::DataType::kDataTypeFp32, M, N, true, alloc_cu); + + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + kernel::get_gemm_kernel(base::DeviceType::kDeviceCUDA)(input, weight, output_cu, config); + cudaStreamSynchronize(config->stream); + + output_cu.to_cpu(); + + // Compare with tolerance for floating point + float max_diff = 0.0f; + for (int i = 0; i < M * N; ++i) { + float diff = std::abs(output_cu.index(i) - output_cpu.index(i)); + max_diff = std::max(max_diff, diff); + ASSERT_NEAR(output_cu.index(i), output_cpu.index(i), 1e-3f) + << "Mismatch at index " << i << ", diff = " << diff; + } + LOG(INFO) << "Max difference: " << max_diff; + + delete config; +} + +// Test GEMM with single token (should fall back to GEMV) +TEST(test_gemm_cu, gemm_single_token_fallback) { + auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + const int K = 128; // in_dim + const int N = 64; // out_dim + + // Single token: 1D input + 
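+  // (A 1D input exercises the `M == 1 && input.dims_size() == 1` branch in
+  //  gemm_kernel_cu, which dispatches to the existing matmul_kernel_cu GEMV path.)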
tensor::Tensor input(base::DataType::kDataTypeFp32, K, true, alloc_cpu); + tensor::Tensor weight(base::DataType::kDataTypeFp32, N, K, true, alloc_cpu); + tensor::Tensor output_cpu(base::DataType::kDataTypeFp32, N, true, alloc_cpu); + + std::mt19937 gen(123); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + for (int i = 0; i < K; ++i) { + input.index(i) = dist(gen); + } + for (int i = 0; i < N * K; ++i) { + weight.index(i) = dist(gen); + } + + // CPU reference (M=1) + gemm_cpu_reference(input.ptr(), weight.ptr(), + output_cpu.ptr(), 1, K, N); + + input.to_cuda(nullptr); + weight.to_cuda(nullptr); + + tensor::Tensor output_cu(base::DataType::kDataTypeFp32, N, true, alloc_cu); + + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + kernel::get_gemm_kernel(base::DeviceType::kDeviceCUDA)(input, weight, output_cu, config); + cudaStreamSynchronize(config->stream); + + output_cu.to_cpu(); + + for (int i = 0; i < N; ++i) { + ASSERT_NEAR(output_cu.index(i), output_cpu.index(i), 1e-4f) + << "Mismatch at index " << i; + } + + delete config; +} + +// Test batched GEMV for decode phase +TEST(test_gemm_cu, gemv_batched_decode) { + auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + const int batch_size = 8; // num sequences + const int K = 256; // in_dim + const int N = 128; // out_dim + + tensor::Tensor input(base::DataType::kDataTypeFp32, batch_size, K, true, alloc_cpu); + tensor::Tensor weight(base::DataType::kDataTypeFp32, N, K, true, alloc_cpu); + tensor::Tensor output_cpu(base::DataType::kDataTypeFp32, batch_size, N, true, alloc_cpu); + + std::mt19937 gen(456); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + for (int i = 0; i < batch_size * K; ++i) { + input.index(i) = dist(gen); + } + for (int i = 0; i < N * K; ++i) { + weight.index(i) = dist(gen); + } + + // CPU reference + gemm_cpu_reference(input.ptr(), weight.ptr(), + output_cpu.ptr(), batch_size, K, N); + + input.to_cuda(nullptr); + weight.to_cuda(nullptr); + + tensor::Tensor output_cu(base::DataType::kDataTypeFp32, batch_size, N, true, alloc_cu); + + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + kernel::get_gemv_batched_kernel(base::DeviceType::kDeviceCUDA)(input, weight, output_cu, config); + cudaStreamSynchronize(config->stream); + + output_cu.to_cpu(); + + float max_diff = 0.0f; + for (int i = 0; i < batch_size * N; ++i) { + float diff = std::abs(output_cu.index(i) - output_cpu.index(i)); + max_diff = std::max(max_diff, diff); + ASSERT_NEAR(output_cu.index(i), output_cpu.index(i), 1e-3f) + << "Mismatch at index " << i; + } + LOG(INFO) << "Batched GEMV max difference: " << max_diff; + + delete config; +} + +// Test GEMM with non-aligned dimensions (edge cases) +TEST(test_gemm_cu, gemm_non_aligned) { + auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + // Non-power-of-2 dimensions + const int M = 17; // num_tokens + const int K = 67; // in_dim + const int N = 33; // out_dim + + tensor::Tensor input(base::DataType::kDataTypeFp32, M, K, true, alloc_cpu); + tensor::Tensor weight(base::DataType::kDataTypeFp32, N, K, true, alloc_cpu); + tensor::Tensor output_cpu(base::DataType::kDataTypeFp32, M, N, true, alloc_cpu); + + std::mt19937 gen(789); + std::uniform_real_distribution dist(-0.5f, 0.5f); + + for (int i = 0; i < M * K; ++i) { + input.index(i) = dist(gen); + } + for (int i = 0; i < N * K; ++i) { + 
weight.index(i) = dist(gen); + } + + gemm_cpu_reference(input.ptr(), weight.ptr(), + output_cpu.ptr(), M, K, N); + + input.to_cuda(nullptr); + weight.to_cuda(nullptr); + + tensor::Tensor output_cu(base::DataType::kDataTypeFp32, M, N, true, alloc_cu); + + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + kernel::get_gemm_kernel(base::DeviceType::kDeviceCUDA)(input, weight, output_cu, config); + cudaStreamSynchronize(config->stream); + + output_cu.to_cpu(); + + for (int i = 0; i < M * N; ++i) { + ASSERT_NEAR(output_cu.index(i), output_cpu.index(i), 1e-3f) + << "Mismatch at index " << i; + } + + delete config; +} + +// Test larger GEMM for performance validation +TEST(test_gemm_cu, gemm_large_prefill) { + auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + const int M = 128; // num_tokens (prefill) + const int K = 512; // in_dim + const int N = 512; // out_dim + + tensor::Tensor input(base::DataType::kDataTypeFp32, M, K, true, alloc_cpu); + tensor::Tensor weight(base::DataType::kDataTypeFp32, N, K, true, alloc_cpu); + tensor::Tensor output_cpu(base::DataType::kDataTypeFp32, M, N, true, alloc_cpu); + + std::mt19937 gen(999); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + for (int i = 0; i < M * K; ++i) { + input.index(i) = dist(gen); + } + for (int i = 0; i < N * K; ++i) { + weight.index(i) = dist(gen); + } + + gemm_cpu_reference(input.ptr(), weight.ptr(), + output_cpu.ptr(), M, K, N); + + input.to_cuda(nullptr); + weight.to_cuda(nullptr); + + tensor::Tensor output_cu(base::DataType::kDataTypeFp32, M, N, true, alloc_cu); + + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + + // Warmup + kernel::get_gemm_kernel(base::DeviceType::kDeviceCUDA)(input, weight, output_cu, config); + cudaStreamSynchronize(config->stream); + + // Timing + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + const int num_iterations = 10; + cudaEventRecord(start, config->stream); + for (int i = 0; i < num_iterations; ++i) { + kernel::get_gemm_kernel(base::DeviceType::kDeviceCUDA)(input, weight, output_cu, config); + } + cudaEventRecord(stop, config->stream); + cudaEventSynchronize(stop); + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + LOG(INFO) << "GEMM [" << M << "x" << K << "] x [" << N << "x" << K << "]^T: " + << milliseconds / num_iterations << " ms per iteration"; + + output_cu.to_cpu(); + + float max_diff = 0.0f; + for (int i = 0; i < M * N; ++i) { + float diff = std::abs(output_cu.index(i) - output_cpu.index(i)); + max_diff = std::max(max_diff, diff); + } + LOG(INFO) << "Max difference: " << max_diff; + ASSERT_LT(max_diff, 1e-2f) << "Results differ too much from CPU reference"; + + cudaEventDestroy(start); + cudaEventDestroy(stop); + delete config; +} diff --git a/test/test_op/test_cu_rope_batch.cpp b/test/test_op/test_cu_rope_batch.cpp new file mode 100644 index 0000000..27c6fa4 --- /dev/null +++ b/test/test_op/test_cu_rope_batch.cpp @@ -0,0 +1,223 @@ +#include +#include +#include +#include +#include +#include "../source/op/kernels/kernels_interface.h" +#include "../source/op/kernels/cuda/rope_kernel.cuh" +#include "base/buffer.h" + +using namespace kernel; + +// Test batched RoPE kernel +TEST(test_rope_batch_cu, rope_batch_basic) { + auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + const int num_tokens = 4; + const 
int num_heads = 8; + const int head_size = 64; + const int dim = num_heads * head_size; // 512 + const int num_kv_heads = 4; + const int kv_dim = num_kv_heads * head_size; // 256 + const int max_seq_len = 128; + + // Create tensors + tensor::Tensor input_q(base::DataType::kDataTypeFp32, num_tokens, dim, true, alloc_cpu); + tensor::Tensor input_k(base::DataType::kDataTypeFp32, num_tokens, kv_dim, true, alloc_cpu); + tensor::Tensor positions(base::DataType::kDataTypeInt32, num_tokens, true, alloc_cpu); + tensor::Tensor sin_cache(base::DataType::kDataTypeFp32, max_seq_len, head_size, true, alloc_cpu); + tensor::Tensor cos_cache(base::DataType::kDataTypeFp32, max_seq_len, head_size, true, alloc_cpu); + + // Initialize Q and K with random values + std::mt19937 gen(42); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + for (int i = 0; i < num_tokens * dim; ++i) { + input_q.index(i) = dist(gen); + } + for (int i = 0; i < num_tokens * kv_dim; ++i) { + input_k.index(i) = dist(gen); + } + + // Set positions: [0, 1, 2, 3] + for (int i = 0; i < num_tokens; ++i) { + positions.index(i) = i; + } + + // Clone for comparison + tensor::Tensor input_q_ref = input_q.clone(); + tensor::Tensor input_k_ref = input_k.clone(); + + // Move to GPU + input_q.to_cuda(nullptr); + input_k.to_cuda(nullptr); + positions.to_cuda(nullptr); + sin_cache.to_cuda(nullptr); + cos_cache.to_cuda(nullptr); + + // Calculate sin/cos cache on GPU + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + + kernel::sin_cos_cache_calc_cu(head_size, max_seq_len, sin_cache, cos_cache, config->stream); + cudaStreamSynchronize(config->stream); + + // Run batched RoPE kernel + kernel::get_rope_batch_kernel(base::DeviceType::kDeviceCUDA)( + num_tokens, dim, kv_dim, head_size, + input_q, input_k, positions, sin_cache, cos_cache, config->stream); + cudaStreamSynchronize(config->stream); + + // Copy results back + input_q.to_cpu(); + input_k.to_cpu(); + + // Compute reference using single-token RoPE kernel for each token + for (int t = 0; t < num_tokens; ++t) { + // Create single-token tensors + tensor::Tensor q_single(base::DataType::kDataTypeFp32, dim, true, alloc_cpu); + tensor::Tensor k_single(base::DataType::kDataTypeFp32, kv_dim, true, alloc_cpu); + // pos_single must stay on CPU because rope_kernel_cu reads it on host + tensor::Tensor pos_single(base::DataType::kDataTypeInt32, 1, true, alloc_cpu); + + // Copy data + for (int i = 0; i < dim; ++i) { + q_single.index(i) = input_q_ref.index(t * dim + i); + } + for (int i = 0; i < kv_dim; ++i) { + k_single.index(i) = input_k_ref.index(t * kv_dim + i); + } + pos_single.index(0) = t; + + // Move Q and K to GPU (pos stays on CPU!) 
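+    // (sin_cache / cos_cache were already moved to CUDA above and are reused
+    //  here for every token; only q_single / k_single need a fresh transfer.)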
+ q_single.to_cuda(nullptr); + k_single.to_cuda(nullptr); + + // Run single-token RoPE + kernel::get_rope_kernel(base::DeviceType::kDeviceCUDA)( + dim, kv_dim, head_size, q_single, k_single, pos_single, + sin_cache, cos_cache, config->stream); + cudaStreamSynchronize(config->stream); + + // Copy back + q_single.to_cpu(); + k_single.to_cpu(); + + // Compare Q + for (int i = 0; i < dim; ++i) { + ASSERT_NEAR(input_q.index(t * dim + i), q_single.index(i), 1e-5f) + << "Q mismatch at token " << t << ", index " << i; + } + + // Compare K + for (int i = 0; i < kv_dim; ++i) { + ASSERT_NEAR(input_k.index(t * kv_dim + i), k_single.index(i), 1e-5f) + << "K mismatch at token " << t << ", index " << i; + } + } + + LOG(INFO) << "Batched RoPE test passed for " << num_tokens << " tokens"; + delete config; +} + +// Test batched RoPE with non-sequential positions (for multi-sequence decode) +TEST(test_rope_batch_cu, rope_batch_multi_sequence) { + auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance(); + auto alloc_cpu = base::CPUDeviceAllocatorFactory::get_instance(); + + const int num_tokens = 4; // 4 sequences, each with 1 token + const int num_heads = 8; + const int head_size = 64; + const int dim = num_heads * head_size; + const int num_kv_heads = 4; + const int kv_dim = num_kv_heads * head_size; + const int max_seq_len = 256; + + tensor::Tensor input_q(base::DataType::kDataTypeFp32, num_tokens, dim, true, alloc_cpu); + tensor::Tensor input_k(base::DataType::kDataTypeFp32, num_tokens, kv_dim, true, alloc_cpu); + tensor::Tensor positions(base::DataType::kDataTypeInt32, num_tokens, true, alloc_cpu); + tensor::Tensor sin_cache(base::DataType::kDataTypeFp32, max_seq_len, head_size, true, alloc_cpu); + tensor::Tensor cos_cache(base::DataType::kDataTypeFp32, max_seq_len, head_size, true, alloc_cpu); + + std::mt19937 gen(123); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + for (int i = 0; i < num_tokens * dim; ++i) { + input_q.index(i) = dist(gen); + } + for (int i = 0; i < num_tokens * kv_dim; ++i) { + input_k.index(i) = dist(gen); + } + + // Non-sequential positions (simulating different sequences at different positions) + // Seq 0 at pos 10, Seq 1 at pos 25, Seq 2 at pos 5, Seq 3 at pos 100 + int pos_values[] = {10, 25, 5, 100}; + positions.index(0) = pos_values[0]; + positions.index(1) = pos_values[1]; + positions.index(2) = pos_values[2]; + positions.index(3) = pos_values[3]; + + tensor::Tensor input_q_ref = input_q.clone(); + tensor::Tensor input_k_ref = input_k.clone(); + + input_q.to_cuda(nullptr); + input_k.to_cuda(nullptr); + positions.to_cuda(nullptr); + sin_cache.to_cuda(nullptr); + cos_cache.to_cuda(nullptr); + + CudaConfig* config = new CudaConfig; + cudaStreamCreate(&config->stream); + + kernel::sin_cos_cache_calc_cu(head_size, max_seq_len, sin_cache, cos_cache, config->stream); + cudaStreamSynchronize(config->stream); + + kernel::get_rope_batch_kernel(base::DeviceType::kDeviceCUDA)( + num_tokens, dim, kv_dim, head_size, + input_q, input_k, positions, sin_cache, cos_cache, config->stream); + cudaStreamSynchronize(config->stream); + + input_q.to_cpu(); + input_k.to_cpu(); + + // Verify each token independently + for (int t = 0; t < num_tokens; ++t) { + tensor::Tensor q_single(base::DataType::kDataTypeFp32, dim, true, alloc_cpu); + tensor::Tensor k_single(base::DataType::kDataTypeFp32, kv_dim, true, alloc_cpu); + // pos_single must stay on CPU! 
+ tensor::Tensor pos_single(base::DataType::kDataTypeInt32, 1, true, alloc_cpu); + + for (int i = 0; i < dim; ++i) { + q_single.index(i) = input_q_ref.index(t * dim + i); + } + for (int i = 0; i < kv_dim; ++i) { + k_single.index(i) = input_k_ref.index(t * kv_dim + i); + } + pos_single.index(0) = pos_values[t]; + + q_single.to_cuda(nullptr); + k_single.to_cuda(nullptr); + // pos_single stays on CPU + + kernel::get_rope_kernel(base::DeviceType::kDeviceCUDA)( + dim, kv_dim, head_size, q_single, k_single, pos_single, + sin_cache, cos_cache, config->stream); + cudaStreamSynchronize(config->stream); + + q_single.to_cpu(); + k_single.to_cpu(); + + for (int i = 0; i < dim; ++i) { + ASSERT_NEAR(input_q.index(t * dim + i), q_single.index(i), 1e-5f) + << "Q mismatch at token " << t << " (pos=" << pos_values[t] << "), index " << i; + } + for (int i = 0; i < kv_dim; ++i) { + ASSERT_NEAR(input_k.index(t * kv_dim + i), k_single.index(i), 1e-5f) + << "K mismatch at token " << t << " (pos=" << pos_values[t] << "), index " << i; + } + } + + LOG(INFO) << "Multi-sequence batched RoPE test passed"; + delete config; +}
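The kernels above are only exercised piecewise by the unit tests. As a reading aid, here is a minimal host-side sketch (not part of the diff) of how the new batched RoPE, KV-cache write, and paged-attention entry points could be chained for one decode step of a single layer. The function name decode_step_sketch, the toy sizes, and the assumption that block_tables / context_lens / positions have already been filled by a cache manager are all illustrative; only the kernel signatures and the tensor / CudaConfig usage are taken from the code above.

#include <cmath>
#include <cuda_runtime.h>
#include "../source/op/kernels/kernels_interface.h"
#include "../source/op/kernels/cuda/paged_attention_kernel.cuh"
#include "../source/op/kernels/cuda/rope_kernel.cuh"
#include "base/buffer.h"

void decode_step_sketch(kernel::CudaConfig* config) {
  auto alloc_cu = base::CUDADeviceAllocatorFactory::get_instance();

  // Assumed toy configuration: one decode token per sequence (hypothetical values).
  const int num_seqs = 2, num_heads = 8, num_kv_heads = 4, head_size = 64;
  const int dim = num_heads * head_size, kv_dim = num_kv_heads * head_size;
  const int block_size = 16, max_blocks_per_seq = 8, total_blocks = 64;
  const int num_layers = 1, layer_idx = 0, max_seq_len = 128;

  // Per-step activations on CUDA: one token per sequence.
  tensor::Tensor q(base::DataType::kDataTypeFp32, num_seqs, dim, true, alloc_cu);
  tensor::Tensor k(base::DataType::kDataTypeFp32, num_seqs, kv_dim, true, alloc_cu);
  tensor::Tensor v(base::DataType::kDataTypeFp32, num_seqs, kv_dim, true, alloc_cu);
  tensor::Tensor out(base::DataType::kDataTypeFp32, num_seqs, dim, true, alloc_cu);

  // Paged cache and batch metadata; block_tables / context_lens / positions are
  // assumed to have been filled by the KV cache manager before this point.
  tensor::Tensor kv_cache(base::DataType::kDataTypeFp32,
                          static_cast<size_t>(total_blocks) * num_layers * 2 * block_size * kv_dim,
                          true, alloc_cu, nullptr);
  tensor::Tensor block_tables(base::DataType::kDataTypeInt32, num_seqs, max_blocks_per_seq, true, alloc_cu);
  tensor::Tensor context_lens(base::DataType::kDataTypeInt32, num_seqs, true, alloc_cu);
  tensor::Tensor positions(base::DataType::kDataTypeInt32, num_seqs, true, alloc_cu);

  // RoPE sin/cos caches, computed once on the stream.
  tensor::Tensor sin_cache(base::DataType::kDataTypeFp32, max_seq_len, head_size, true, alloc_cu);
  tensor::Tensor cos_cache(base::DataType::kDataTypeFp32, max_seq_len, head_size, true, alloc_cu);
  kernel::sin_cos_cache_calc_cu(head_size, max_seq_len, sin_cache, cos_cache, config->stream);

  // 1) Rotate Q/K for all in-flight sequences in a single launch.
  kernel::get_rope_batch_kernel(base::DeviceType::kDeviceCUDA)(
      num_seqs, dim, kv_dim, head_size, q, k, positions, sin_cache, cos_cache, config->stream);

  // 2) Append each sequence's new K/V to its physical block for this layer.
  kernel::batch_kv_cache_write_cu(kv_cache, k, v, block_tables, positions, num_seqs,
                                  num_kv_heads, head_size, block_size, max_blocks_per_seq,
                                  layer_idx, num_layers, config->stream);

  // 3) Attention over the paged cache (one query token per sequence -> decode kernel).
  const float scale = 1.0f / std::sqrt(static_cast<float>(head_size));
  kernel::paged_attention_kernel_cu(out, q, kv_cache, block_tables, context_lens,
                                    /*max_context_len=*/max_seq_len, num_seqs, num_heads,
                                    num_kv_heads, head_size, block_size, max_blocks_per_seq,
                                    scale, layer_idx, num_layers, config->stream);
  cudaStreamSynchronize(config->stream);
}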