From 04b0b6876f16d7249de6f74e2601f45e7d25fa07 Mon Sep 17 00:00:00 2001
From: ttaohe
Date: Thu, 18 Dec 2025 17:12:50 +0800
Subject: [PATCH 1/4] dev: pass unit tests

---
 .../ops/rearrange/nvidia/rearrange_kernel.cuh | 125 ++++++-
 .../ops/rearrange/nvidia/rearrange_nvidia.cu  | 346 +++++++++++++++++-
 2 files changed, 454 insertions(+), 17 deletions(-)

diff --git a/src/infiniop/ops/rearrange/nvidia/rearrange_kernel.cuh b/src/infiniop/ops/rearrange/nvidia/rearrange_kernel.cuh
index f273bfeba..34e233996 100644
--- a/src/infiniop/ops/rearrange/nvidia/rearrange_kernel.cuh
+++ b/src/infiniop/ops/rearrange/nvidia/rearrange_kernel.cuh
@@ -8,8 +8,8 @@
 #define ARRAY_TYPE_SIZE size_t
 
 // Coupled with DEFINE_KERNELS_BY_CONSTRAINT; both must be changed together
-#define MAX_BLOCK_ARRAY_SIZE 5
-#define MAX_GRID_ARRAY_SIZE 5
+#define MAX_BLOCK_ARRAY_SIZE 6
+#define MAX_GRID_ARRAY_SIZE 6
 
 template struct ArrayStruct {
@@ -185,32 +185,145 @@ struct Constraint {
     DEFINE_REARRANGE_KERNEL(double4, constraint_num, block_array_size, grid_array_size)
 
 // Coupled with MAX_BLOCK_ARRAY_SIZE and MAX_GRID_ARRAY_SIZE; both must be changed together
-// Generate kernels for all combinations of 1-5 and 1-5
+// Generate kernels for all combinations of 1-6 and 1-6
 DEFINE_KERNELS_BY_CONSTRAINT(1, 1)
 DEFINE_KERNELS_BY_CONSTRAINT(1, 2)
 DEFINE_KERNELS_BY_CONSTRAINT(1, 3)
 DEFINE_KERNELS_BY_CONSTRAINT(1, 4)
 DEFINE_KERNELS_BY_CONSTRAINT(1, 5)
+DEFINE_KERNELS_BY_CONSTRAINT(1, 6)
 DEFINE_KERNELS_BY_CONSTRAINT(2, 1)
 DEFINE_KERNELS_BY_CONSTRAINT(2, 2)
 DEFINE_KERNELS_BY_CONSTRAINT(2, 3)
 DEFINE_KERNELS_BY_CONSTRAINT(2, 4)
 DEFINE_KERNELS_BY_CONSTRAINT(2, 5)
+DEFINE_KERNELS_BY_CONSTRAINT(2, 6)
 DEFINE_KERNELS_BY_CONSTRAINT(3, 1)
 DEFINE_KERNELS_BY_CONSTRAINT(3, 2)
 DEFINE_KERNELS_BY_CONSTRAINT(3, 3)
 DEFINE_KERNELS_BY_CONSTRAINT(3, 4)
 DEFINE_KERNELS_BY_CONSTRAINT(3, 5)
+DEFINE_KERNELS_BY_CONSTRAINT(3, 6)
 DEFINE_KERNELS_BY_CONSTRAINT(4, 1)
 DEFINE_KERNELS_BY_CONSTRAINT(4, 2)
 DEFINE_KERNELS_BY_CONSTRAINT(4, 3)
 DEFINE_KERNELS_BY_CONSTRAINT(4, 4)
 DEFINE_KERNELS_BY_CONSTRAINT(4, 5)
+DEFINE_KERNELS_BY_CONSTRAINT(4, 6)
 DEFINE_KERNELS_BY_CONSTRAINT(5, 1)
 DEFINE_KERNELS_BY_CONSTRAINT(5, 2)
 DEFINE_KERNELS_BY_CONSTRAINT(5, 3)
 DEFINE_KERNELS_BY_CONSTRAINT(5, 4)
 DEFINE_KERNELS_BY_CONSTRAINT(5, 5)
+DEFINE_KERNELS_BY_CONSTRAINT(5, 6)
+DEFINE_KERNELS_BY_CONSTRAINT(6, 1)
+DEFINE_KERNELS_BY_CONSTRAINT(6, 2)
+DEFINE_KERNELS_BY_CONSTRAINT(6, 3)
+DEFINE_KERNELS_BY_CONSTRAINT(6, 4)
+DEFINE_KERNELS_BY_CONSTRAINT(6, 5)
+DEFINE_KERNELS_BY_CONSTRAINT(6, 6)
+
+// ==============================================================================
+// Dynamic kernel - a fallback implementation supporting arbitrary dimensionality
+// ==============================================================================
+
+template <typename Tmem>
+__global__ void rearrange_dynamic_kernel(
+    void *__restrict__ dst,
+    const void *__restrict__ src,
+    const size_t block_dim,
+    const size_t block_len_total,
+    const ARRAY_TYPE_SIZE *block_len,
+    const ARRAY_TYPE_STRIDE *src_block_stride,
+    const ARRAY_TYPE_STRIDE *dst_block_stride,
+    const size_t grid_dim,
+    const ARRAY_TYPE_SIZE *grid_len,
+    const ARRAY_TYPE_STRIDE *src_grid_stride,
+    const ARRAY_TYPE_STRIDE *dst_grid_stride,
+    const size_t constraint_num,
+    const Constraint *constraints) {
+
+    size_t thread_idx = threadIdx.x;
+    if (thread_idx >= block_len_total) {
+        return;
+    }
+
+    // Use shared memory to store the grid-level offsets
+    __shared__ ptrdiff_t shared_src_offset;
+    __shared__ ptrdiff_t shared_dst_offset;
+    __shared__ ARRAY_TYPE_SIZE shared_constraints_grid_idx_multiple[2]; // at most 2 constraints are supported
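+
+    // How the offsets are derived (illustrative note, not part of the original
+    // patch): blockIdx.x is decomposed in mixed radix over grid_len with the
+    // last dimension fastest. E.g. for grid_len = (2, 3, 4) and blockIdx.x = 17:
+    // i2 = 17 % 4 = 1, i1 = (17 / 4) % 3 = 1, i0 = 17 / 12 = 1, and indeed
+    // 1 * 12 + 1 * 4 + 1 = 17. Each index is then multiplied by the matching
+    // src/dst grid stride.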
+
+    // Thread 0 computes the grid-level offsets
+    if (threadIdx.x == 0) {
+        ptrdiff_t src_offset = 0;
+        ptrdiff_t dst_offset = 0;
+        size_t remaining = blockIdx.x;
+
+        // Initialize the constraints
+        for (size_t j = 0; j < constraint_num && j < 2; j++) {
+            shared_constraints_grid_idx_multiple[j] = 0;
+        }
+
+        // Compute the offsets over the grid dimensions
+        for (int i = grid_dim - 1; i >= 0; i--) {
+            size_t idx = remaining % grid_len[i];
+            remaining /= grid_len[i];
+            src_offset += idx * src_grid_stride[i];
+            dst_offset += idx * dst_grid_stride[i];
+
+            // Handle the constraints
+            for (size_t j = 0; j < constraint_num && j < 2; j++) {
+                if (i == constraints[j].grid_idx) {
+                    shared_constraints_grid_idx_multiple[j] = idx * constraints[j].grid_div_block;
+                }
+            }
+        }
+
+        shared_src_offset = src_offset;
+        shared_dst_offset = dst_offset;
+    }
+
+    __syncthreads();
+
+    // All threads read the shared values
+    ptrdiff_t src_offset = shared_src_offset;
+    ptrdiff_t dst_offset = shared_dst_offset;
+    ARRAY_TYPE_SIZE constraints_grid_idx_multiple[2];
+    for (size_t j = 0; j < constraint_num && j < 2; j++) {
+        constraints_grid_idx_multiple[j] = shared_constraints_grid_idx_multiple[j];
+    }
+
+    // Compute the offsets over the block dimensions
+    size_t remaining = thread_idx;
+    for (int i = block_dim - 1; i >= 0; i--) {
+        size_t idx = remaining % block_len[i];
+        remaining /= block_len[i];
+
+        src_offset += idx * src_block_stride[i];
+        dst_offset += idx * dst_block_stride[i];
+
+        // Check the constraints
+        for (size_t j = 0; j < constraint_num && j < 2; j++) {
+            if (i == constraints[j].block_idx) {
+                if (constraints_grid_idx_multiple[j] + idx >= constraints[j].total_len) {
+                    return;
+                }
+            }
+        }
+    }
+
+    // Perform the data copy
+    *reinterpret_cast<Tmem *>(reinterpret_cast<char *>(dst) + dst_offset) =
+        *reinterpret_cast<const Tmem *>(reinterpret_cast<const char *>(src) + src_offset);
+}
+
+// Explicit template instantiations of the dynamic kernel for each data type
+template __global__ void rearrange_dynamic_kernel<uchar1>(void *, const void *, const size_t, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const Constraint *);
+template __global__ void rearrange_dynamic_kernel<uchar2>(void *, const void *, const size_t, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const Constraint *);
+template __global__ void rearrange_dynamic_kernel<float1>(void *, const void *, const size_t, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const Constraint *);
+template __global__ void rearrange_dynamic_kernel<float2>(void *, const void *, const size_t, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const Constraint *);
+template __global__ void rearrange_dynamic_kernel<float4>(void *, const void *, const size_t, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const Constraint *);
+template __global__ void rearrange_dynamic_kernel<double4>(void *, const void *, const size_t, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const Constraint *);
 
 // Parameter preparation struct
 struct RearrangeParams {
@@ -294,6 +407,9 @@ utils::Result getRearrangeKernel(const RearrangeParams &params) {
     case 5:                                                           \
         GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 5);      \
         break;                                                        \
+    case 6:                                                           \
+        GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 6);      \
+        break;                                                        \
     }
 
 #define GET_REARRANGE_KERNEL_BY_BLOCK_NUM  \
@@ -313,6 +429,9 @@ utils::Result getRearrangeKernel(const RearrangeParams &params) {
     case 5:                                  \
         GET_REARRANGE_KERNEL_BY_GRID_NUM(5); \
         break;                               \
+    case 6:                                  \
+        GET_REARRANGE_KERNEL_BY_GRID_NUM(6); \
+        break;                               \
     }
 
     GET_REARRANGE_KERNEL_BY_BLOCK_NUM
diff --git a/src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cu b/src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cu
index 8770f92dd..c4cdb4075 100644
--- a/src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cu
+++ b/src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cu
@@ -5,8 +5,11 @@
 #include "rearrange_nvidia.cuh"
 #include <algorithm>
 #include <cmath>
+#include <cstdio>
+#include <cstdlib>
 #include <cstring>
 #include <numeric>
+#include <string>
 #include <vector>
 
 namespace op::rearrange::nvidia {
@@ -140,17 +143,80 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta
         return std::abs(dims[a].src_stride) < std::abs(dims[b].src_stride);
     });
 
-    // Dimension selection loop
+    // Helper: check whether a block size is warp-friendly
+    auto is_warp_friendly = [](size_t size) -> bool {
+        // Multiples of 32, and other common efficient configurations, qualify
+        if (size % 32 == 0) return true;
+        // Powers of two no larger than 32 are also acceptable
+        if (size <= 32 && (size & (size - 1)) == 0) return true;
+        return false;
+    };
+
+    // Helper: compute the warp-efficiency loss
+    auto warp_efficiency = [](size_t size) -> double {
+        if (size == 0) return 0.0;
+        size_t warps = (size + 31) / 32;
+        size_t wasted = warps * 32 - size;
+        return 1.0 - (double)wasted / (double)(warps * 32);
+    };
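+
+    // Worked example (added for clarity, not part of the original patch):
+    // block_size = 48 occupies ceil(48 / 32) = 2 warps, i.e. 64 lanes, of
+    // which 16 are idle, so warp_efficiency(48) = 1 - 16/64 = 0.75, whereas
+    // warp_efficiency(64) = 1.0 and is_warp_friendly(64) holds.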
+
+    // Dimension selection loop - with warp-alignment optimization
     for (size_t i = 0; i < ndim; ++i) {
         size_t dim_idx = dim_order[i];
         size_t dim_len = shape[dim_idx];
+        size_t next_block_size = block_elements * dim_len;
+
+        if (next_block_size <= (size_t)max_threads) {
+            // Decide whether this dimension should be added
+            bool should_add = true;
+
+            // Strategy 1: if the current size is warp-friendly and the next one
+            // would break alignment, consider skipping the dimension
+            if (is_warp_friendly(block_elements) && !is_warp_friendly(next_block_size)) {
+                // Check whether we are close to max_threads and efficiency would drop noticeably
+                if (next_block_size > max_threads * 0.95) {
+                    double current_eff = warp_efficiency(block_elements);
+                    double next_eff = warp_efficiency(next_block_size);
+
+                    // Skip this dimension if the efficiency loss exceeds 5%
+                    if (current_eff - next_eff > 0.05) {
+                        should_add = false;
+                    }
+                }
+            }
+
+            // Strategy 2: for sizes that would land close to 1024 but misaligned, try to adjust
+            if (should_add && next_block_size > 992 && next_block_size < 1024 && !is_warp_friendly(next_block_size)) {
+                // If the current block_elements is warp-friendly and already large enough (>512), stop adding
+                if (is_warp_friendly(block_elements) && block_elements >= 512) {
+                    should_add = false;
+                }
+            }
 
-        if (block_elements * dim_len <= (size_t)max_threads) {
-            block_dim_choose[dim_idx] = true;
-            block_elements *= dim_len;
+            if (should_add) {
+                block_dim_choose[dim_idx] = true;
+                block_elements = next_block_size;
+            }
         } else if (block_elements > 1 && dim_len > 1) {
             // This dimension needs to be split
             size_t num_per_block = std::min(dim_len, (size_t)max_threads / block_elements);
+
+            // Optimize the split: prefer a split whose resulting block size is a multiple of 32
+            if (num_per_block > 1 && block_elements > 1) {
+                size_t target_block_size = block_elements * num_per_block;
+
+                // If it is not warp-friendly, try to adjust num_per_block
+                if (!is_warp_friendly(target_block_size) && target_block_size > 32) {
+                    // Look for the nearest num_per_block that yields a warp-aligned result
+                    for (size_t try_num = num_per_block; try_num > 0; try_num--) {
+                        size_t try_size = block_elements * try_num;
+                        if (is_warp_friendly(try_size) && try_size >= 512) {
+                            num_per_block = try_num;
+                            break;
+                        }
+                    }
+                }
+            }
+
             if (num_per_block > 0) {
                 size_t num_per_grid = (dim_len + num_per_block - 1) / num_per_block;
 
@@ -174,11 +240,42 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta
         size_t dim_len = shape[dim_idx];
 
         if (dim_len <= (size_t)max_threads) {
-            block_dim_choose[dim_idx] = true;
-            block_elements = dim_len;
+            // Optimization: if dim_len is not warp-friendly, try the nearest warp boundary
+            if (!is_warp_friendly(dim_len) && dim_len > 32) {
+                // Prefer a multiple of 32
+                size_t aligned_len = (dim_len / 32) * 32;
+                if (aligned_len >= 512 && aligned_len <= (size_t)max_threads) {
+                    // Use the split strategy
+                    size_t num_per_grid = (dim_len + aligned_len - 1) / aligned_len;
+                    SplitDim split_dim = {
+                        dim_idx,
+                        aligned_len,
+                        num_per_grid,
+                        0,
+                        0,
+                        dim_len};
+                    split_dims.push_back(split_dim);
+                    block_elements = aligned_len;
+                } else {
+                    block_dim_choose[dim_idx] = true;
+                    block_elements = dim_len;
+                }
+            } else {
+                block_dim_choose[dim_idx] = true;
+                block_elements = dim_len;
+            }
         } else {
             // The dimension needs to be split
             size_t num_per_block = std::min(dim_len, (size_t)max_threads);
+
+            // Optimization: prefer multiples of 32
+            if (!is_warp_friendly(num_per_block) && num_per_block > 32) {
+                size_t aligned = (num_per_block / 32) * 32;
+                if (aligned >= 512) {
+                    num_per_block = aligned;
+                }
+            }
+
             size_t num_per_grid = (dim_len + num_per_block - 1) / num_per_block;
 
             SplitDim split_dim = {
@@ -294,6 +391,171 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta
     return utils::Result(params);
 }
 
+// ==============================================================================
+// Dynamic kernel launcher - supports arbitrary dimensionality
+// ==============================================================================
+
+template <unsigned int BLOCK_SIZE>
+infiniStatus_t launchDynamicKernel(
+    void *y,
+    const void *x,
+    size_t grid_size,
+    const RearrangeParams &params,
+    size_t unit_size,
+    cudaStream_t stream) {
+
+    // Validate the parameters
+    if (params.block_len.empty() || params.grid_len.empty()) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    size_t block_dim = params.block_len.size();
+    size_t grid_dim = params.grid_len.size();
+    size_t block_len_total = params.block_len_total;
+    size_t constraint_num = params.constraints.size();
+
+    // Device-side arrays
+    ARRAY_TYPE_SIZE *d_block_len = nullptr;
+    ARRAY_TYPE_STRIDE *d_src_block_stride = nullptr;
+    ARRAY_TYPE_STRIDE *d_dst_block_stride = nullptr;
+    ARRAY_TYPE_SIZE *d_grid_len = nullptr;
+    ARRAY_TYPE_STRIDE *d_src_grid_stride = nullptr;
+    ARRAY_TYPE_STRIDE *d_dst_grid_stride = nullptr;
+    Constraint *d_constraints = nullptr;
+
+    // Allocate device memory
+    CHECK_OR_RETURN(cudaMalloc(&d_block_len, block_dim * sizeof(ARRAY_TYPE_SIZE)) == cudaSuccess,
+                    INFINI_STATUS_INTERNAL_ERROR);
+    CHECK_OR_RETURN(cudaMalloc(&d_src_block_stride, block_dim * sizeof(ARRAY_TYPE_STRIDE)) == cudaSuccess,
+                    INFINI_STATUS_INTERNAL_ERROR);
+    CHECK_OR_RETURN(cudaMalloc(&d_dst_block_stride, block_dim * sizeof(ARRAY_TYPE_STRIDE)) == cudaSuccess,
+                    INFINI_STATUS_INTERNAL_ERROR);
+    CHECK_OR_RETURN(cudaMalloc(&d_grid_len, grid_dim * sizeof(ARRAY_TYPE_SIZE)) == cudaSuccess,
+                    INFINI_STATUS_INTERNAL_ERROR);
+    CHECK_OR_RETURN(cudaMalloc(&d_src_grid_stride, grid_dim * sizeof(ARRAY_TYPE_STRIDE)) == cudaSuccess,
+                    INFINI_STATUS_INTERNAL_ERROR);
+    CHECK_OR_RETURN(cudaMalloc(&d_dst_grid_stride, grid_dim * sizeof(ARRAY_TYPE_STRIDE)) == cudaSuccess,
+                    INFINI_STATUS_INTERNAL_ERROR);
+
+    // Use asynchronous copies for better performance
+    CHECK_OR_RETURN(cudaMemcpyAsync(d_block_len, params.block_len.data(),
+                                    block_dim * sizeof(ARRAY_TYPE_SIZE),
+                                    cudaMemcpyHostToDevice, stream) == cudaSuccess,
+                    INFINI_STATUS_INTERNAL_ERROR);
+    CHECK_OR_RETURN(cudaMemcpyAsync(d_src_block_stride, params.src_block_stride.data(),
+                                    block_dim * sizeof(ARRAY_TYPE_STRIDE),
+                                    cudaMemcpyHostToDevice, stream) == cudaSuccess,
+                    INFINI_STATUS_INTERNAL_ERROR);
+    CHECK_OR_RETURN(cudaMemcpyAsync(d_dst_block_stride, params.dst_block_stride.data(),
+                                    block_dim * sizeof(ARRAY_TYPE_STRIDE),
+                                    cudaMemcpyHostToDevice, stream) == cudaSuccess,
+                    INFINI_STATUS_INTERNAL_ERROR);
+    CHECK_OR_RETURN(cudaMemcpyAsync(d_grid_len, params.grid_len.data(),
+                                    grid_dim * sizeof(ARRAY_TYPE_SIZE),
+                                    cudaMemcpyHostToDevice, stream) == cudaSuccess,
+                    INFINI_STATUS_INTERNAL_ERROR);
+    CHECK_OR_RETURN(cudaMemcpyAsync(d_src_grid_stride, params.src_grid_stride.data(),
+                                    grid_dim * sizeof(ARRAY_TYPE_STRIDE),
+                                    cudaMemcpyHostToDevice, stream) == cudaSuccess,
+                    INFINI_STATUS_INTERNAL_ERROR);
+    CHECK_OR_RETURN(cudaMemcpyAsync(d_dst_grid_stride, params.dst_grid_stride.data(),
+                                    grid_dim * sizeof(ARRAY_TYPE_STRIDE),
+                                    cudaMemcpyHostToDevice, stream) == cudaSuccess,
+                    INFINI_STATUS_INTERNAL_ERROR);
+
+    // Handle the constraints
+    if (constraint_num > 0) {
+        CHECK_OR_RETURN(cudaMalloc(&d_constraints, constraint_num * sizeof(Constraint)) == cudaSuccess,
+                        INFINI_STATUS_INTERNAL_ERROR);
+        CHECK_OR_RETURN(cudaMemcpyAsync(d_constraints, params.constraints.data(),
+                                        constraint_num * sizeof(Constraint),
+                                        cudaMemcpyHostToDevice, stream) == cudaSuccess,
+                        INFINI_STATUS_INTERNAL_ERROR);
+    }
+
+    // Pick the kernel matching unit_size
+    void *kernel_func = nullptr;
+    switch (unit_size) {
+    case 1:
+        kernel_func = (void *)rearrange_dynamic_kernel<uchar1>;
+        break;
+    case 2:
+        kernel_func = (void *)rearrange_dynamic_kernel<uchar2>;
+        break;
+    case 4:
+        kernel_func = (void *)rearrange_dynamic_kernel<float1>;
+        break;
+    case 8:
+        kernel_func = (void *)rearrange_dynamic_kernel<float2>;
+        break;
+    case 16:
+        kernel_func = (void *)rearrange_dynamic_kernel<float4>;
+        break;
+    case 32:
+        kernel_func = (void *)rearrange_dynamic_kernel<double4>;
+        break;
+    default:
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    // Kernel arguments
+    void *args[] = {
+        &y, &x,
+        const_cast<size_t *>(&block_dim),
+        const_cast<size_t *>(&block_len_total),
+        &d_block_len,
+        &d_src_block_stride,
+        &d_dst_block_stride,
+        const_cast<size_t *>(&grid_dim),
+        &d_grid_len,
+        &d_src_grid_stride,
+        &d_dst_grid_stride,
+        const_cast<size_t *>(&constraint_num),
+        &d_constraints};
+
+    // Launch the kernel
+    cudaError_t launch_result = cudaLaunchKernel(
+        kernel_func,
+        static_cast<unsigned int>(grid_size),
+        static_cast<unsigned int>(BLOCK_SIZE),
+        args, 0, stream);
+
+    // Check whether the launch succeeded
+    if (launch_result != cudaSuccess) {
+        // Free the device memory
+        cudaFree(d_block_len);
+        cudaFree(d_src_block_stride);
+        cudaFree(d_dst_block_stride);
+        cudaFree(d_grid_len);
+        cudaFree(d_src_grid_stride);
+        cudaFree(d_dst_grid_stride);
+        if (d_constraints) {
+            cudaFree(d_constraints);
+        }
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+
+    // Make sure the kernel has finished before freeing the memory
+    // Note: cudaFree synchronizes implicitly, so no explicit cudaStreamSynchronize is needed here
+
+    // Free the device memory
+    cudaFree(d_block_len);
+    cudaFree(d_src_block_stride);
+    cudaFree(d_dst_block_stride);
+    cudaFree(d_grid_len);
+    cudaFree(d_src_grid_stride);
+    cudaFree(d_dst_grid_stride);
+    if (d_constraints) {
+        cudaFree(d_constraints);
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// ==============================================================================
+// Static kernel launchers - optimized for the common dimension combinations
+// ==============================================================================
+
 // Kernel launch template with constraint support
 template <unsigned int BLOCK_SIZE>
 infiniStatus_t launchKernel(
@@ -388,17 +650,73 @@ infiniStatus_t Descriptor::calculate(
         return INFINI_STATUS_BAD_PARAM;
     }
 
-    // Select the kernel according to the device properties
-    infiniStatus_t status = INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-    size_t block_size = params.block_len_total;
+    size_t block_dim = params.block_len.size();
+    size_t grid_dim = params.grid_len.size();
+    size_t block_size = params.block_len_total;
+
+    // Debug output (enabled via the environment variable REARRANGE_DEBUG=1)
+    static bool debug_enabled = []() {
+        const char* env = std::getenv("REARRANGE_DEBUG");
+        return env != nullptr && std::string(env) == "1";
+    }();
+
+    if (debug_enabled) {
+        printf("\n=== Rearrange Debug Info ===\n");
+        printf("ndim: %zu, unit_size: %zu\n", _meta.ndim(), _meta.unit());
+        printf("block_dim: %zu, block_size: %zu\n", block_dim, block_size);
+        printf("grid_dim: %zu, grid_size: %zu\n", grid_dim, grid_size);
+        printf("block_len: [");
+        for (size_t i = 0; i < params.block_len.size(); ++i) {
+            printf("%zu%s", params.block_len[i], i + 1 < params.block_len.size() ? ", " : "");
+        }
+        printf("]\n");
+        printf("grid_len: [");
+        for (size_t i = 0; i < params.grid_len.size(); ++i) {
+            printf("%zu%s", params.grid_len[i], i + 1 < params.grid_len.size() ? ", " : "");
+        }
+        printf("]\n");
+        printf("constraints: %zu\n", params.constraints.size());
+        printf("============================\n");
+    }
+
+    // Decide whether the dynamic kernel (fallback path) is required
+    bool use_dynamic_kernel = false;
+
+    // Case 1: the dimensionality exceeds what the static kernels support
+    if (block_dim > MAX_BLOCK_ARRAY_SIZE || grid_dim > MAX_GRID_ARRAY_SIZE) {
+        use_dynamic_kernel = true;
+    }
+
+    // Case 2: the number of constraints exceeds what the static kernels support
+    if (params.constraints.size() > 2) {
+        use_dynamic_kernel = true;
+    }
 
-    if (block_size <= CUDA_BLOCK_SIZE_512) {
-        status = launchKernel(y, x, grid_size, params, _meta.unit(), cuda_stream);
-    } else if (block_size <= CUDA_BLOCK_SIZE_1024) {
-        status = launchKernel(y, x, grid_size, params, _meta.unit(), cuda_stream);
+    if (debug_enabled) {
+        printf("kernel_type: %s\n", use_dynamic_kernel ? "DYNAMIC" : "STATIC");
+        printf("block_size_choice: %s\n", block_size <= CUDA_BLOCK_SIZE_512 ? "512" : "1024");
+    }
+
+    infiniStatus_t status = INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+
+    if (use_dynamic_kernel) {
+        // Use the dynamic kernel for high-dimensional or otherwise special cases
+        if (block_size <= CUDA_BLOCK_SIZE_512) {
+            status = launchDynamicKernel<CUDA_BLOCK_SIZE_512>(y, x, grid_size, params, _meta.unit(), cuda_stream);
+        } else if (block_size <= CUDA_BLOCK_SIZE_1024) {
+            status = launchDynamicKernel<CUDA_BLOCK_SIZE_1024>(y, x, grid_size, params, _meta.unit(), cuda_stream);
+        } else {
+            return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+        }
     } else {
-        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+        // Use the statically generated kernels for the common cases
+        if (block_size <= CUDA_BLOCK_SIZE_512) {
+            status = launchKernel<CUDA_BLOCK_SIZE_512>(y, x, grid_size, params, _meta.unit(), cuda_stream);
+        } else if (block_size <= CUDA_BLOCK_SIZE_1024) {
+            status = launchKernel<CUDA_BLOCK_SIZE_1024>(y, x, grid_size, params, _meta.unit(), cuda_stream);
+        } else {
+            return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+        }
     }
 
     return status;

From 8cd96bab197693c36a34ea4540b1096334dd7edb Mon Sep 17 00:00:00 2001
From: ttaohe
Date: Thu, 18 Dec 2025 18:58:51 +0800
Subject: [PATCH 2/4] feat(rearrange): optimize 6D transpose performance for
 CUDA

---
 .../ops/rearrange/nvidia/rearrange_kernel.cuh |   6 +-
 .../ops/rearrange/nvidia/rearrange_nvidia.cu  | 157 ++++++++++-
 .../nvidia/rearrange_transpose_kernel.cuh     | 213 ++++++++++++++++++
 3 files changed, 372 insertions(+), 4 deletions(-)
 create mode 100644 src/infiniop/ops/rearrange/nvidia/rearrange_transpose_kernel.cuh

diff --git a/src/infiniop/ops/rearrange/nvidia/rearrange_kernel.cuh b/src/infiniop/ops/rearrange/nvidia/rearrange_kernel.cuh
index 34e233996..20a7a4443 100644
--- a/src/infiniop/ops/rearrange/nvidia/rearrange_kernel.cuh
+++ b/src/infiniop/ops/rearrange/nvidia/rearrange_kernel.cuh
@@ -182,7 +182,7 @@ struct Constraint {
     DEFINE_REARRANGE_KERNEL(float1, constraint_num, block_array_size, grid_array_size) \
     DEFINE_REARRANGE_KERNEL(float2, constraint_num, block_array_size, grid_array_size) \
     DEFINE_REARRANGE_KERNEL(float4, constraint_num, block_array_size, grid_array_size) \
-    DEFINE_REARRANGE_KERNEL(double4, constraint_num, block_array_size, grid_array_size)
+    DEFINE_REARRANGE_KERNEL(double4_32a, constraint_num, block_array_size, grid_array_size)
 
 // Coupled with MAX_BLOCK_ARRAY_SIZE and MAX_GRID_ARRAY_SIZE; both must be changed together
 // Generate kernels for all combinations of 1-6 and 1-6
@@ -323,7 +323,7 @@ template __global__ void rearrange_dynamic_kernel<uchar2>(void *, const void *,
 template __global__ void rearrange_dynamic_kernel<float1>(void *, const void *, const size_t, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const Constraint *);
 template __global__ void rearrange_dynamic_kernel<float2>(void *, const void *, const size_t, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const Constraint *);
 template __global__ void rearrange_dynamic_kernel<float4>(void *, const void *, const size_t, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const Constraint *);
-template __global__ void rearrange_dynamic_kernel<double4>(void *, const void *, const size_t, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const Constraint *);
+template __global__ void rearrange_dynamic_kernel<double4_32a>(void *, const void *, const size_t, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const ARRAY_TYPE_SIZE *, const ARRAY_TYPE_STRIDE *, const ARRAY_TYPE_STRIDE *, const size_t, const Constraint *);
 
 // Parameter preparation struct
 struct RearrangeParams {
@@ -371,7 +371,7 @@ utils::Result getRearrangeKernel(const RearrangeParams &params) {
         GET_REARRANGE_KERNEL(float4, block_array_size, grid_array_size, constraint_num); \
         break; \
     case 32: \
-        GET_REARRANGE_KERNEL(double4, block_array_size, grid_array_size, constraint_num); \
+        GET_REARRANGE_KERNEL(double4_32a, block_array_size, grid_array_size, constraint_num); \
         break; \
     default: \
         return INFINI_STATUS_BAD_PARAM; \
diff --git a/src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cu b/src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cu
index c4cdb4075..6f8b52082 100644
--- a/src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cu
+++ b/src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cu
@@ -2,6 +2,7 @@
 #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
 #include "../../../tensor.h"
 #include "rearrange_kernel.cuh"
+#include "rearrange_transpose_kernel.cuh"
 #include "rearrange_nvidia.cuh"
@@ -74,6 +75,91 @@ struct SplitDim {
     size_t dim_len;
 };
 
+/**
+ * Detect a full-transpose pattern (row-major to column-major, or the reverse)
+ *
+ * Decision logic:
+ * 1. Check whether src_strides and dst_strides exhibit opposite increasing/decreasing trends
+ * 2. For a row-major to column-major conversion: src_strides decrease while dst_strides increase
+ * 3. Only enable the optimization for large transposes that satisfy the conditions
+ *
+ * @param meta rearrange metadata
+ * @return true if this is a full-transpose pattern suitable for the optimization
+ */
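+// Illustrative example (added, not from the original patch): for shape
+// (2, 3, 4), a row-major layout has element strides (12, 4, 1) while a
+// column-major layout has (1, 2, 6). Sorting the dimensions by |stride|
+// yields exactly reversed dimension orders, which is the condition checked
+// below (the real check additionally requires ndim >= 4 and a large element
+// count).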
+bool isFullTransposePattern(const utils::RearrangeMeta &meta) {
+    const size_t ndim = meta.ndim();
+
+    // Only enable the transpose optimization for a specific range of dimensionalities,
+    // to avoid applying it to small or otherwise unsuitable cases
+    if (ndim < 4 || ndim > 6) {
+        return false;
+    }
+
+    const ptrdiff_t *src_strides = meta.src_strides();
+    const ptrdiff_t *dst_strides = meta.dst_strides();
+    const ptrdiff_t *idx_strides = meta.idx_strides();
+    const size_t unit = meta.unit();
+
+    // Reconstruct the actual shape
+    std::vector<size_t> shape(ndim);
+    auto prev_idx_stride = meta.count();
+    for (size_t i = 0; i < ndim; ++i) {
+        shape[i] = prev_idx_stride / idx_strides[i];
+        prev_idx_stride = idx_strides[i];
+    }
+
+    // Compute the total element count; only enable the optimization at scale
+    size_t total_elements = 1;
+    for (size_t i = 0; i < ndim; ++i) {
+        total_elements *= shape[i];
+    }
+
+    // Only enable the transpose optimization for large data (>100K elements)
+    if (total_elements < 100000) {
+        return false;
+    }
+
+    // Skip dimensions of size 1 and collect the indices of the effective dimensions
+    std::vector<size_t> valid_dims;
+    for (size_t i = 0; i < ndim; ++i) {
+        if (shape[i] > 1) {
+            valid_dims.push_back(i);
+        }
+    }
+
+    // At least 4 effective dimensions are required for the transpose optimization
+    if (valid_dims.size() < 4) {
+        return false;
+    }
+
+    // Check for a full row-major to column-major conversion.
+    // This is the easiest pattern to recognize: src_strides decrease while
+    // dst_strides increase (or the reverse).
+
+    // Sort the strides
+    std::vector<std::pair<ptrdiff_t, size_t>> src_stride_order;
+    std::vector<std::pair<ptrdiff_t, size_t>> dst_stride_order;
+
+    for (size_t i = 0; i < valid_dims.size(); ++i) {
+        size_t dim = valid_dims[i];
+        src_stride_order.push_back({std::abs(src_strides[dim]), dim});
+        dst_stride_order.push_back({std::abs(dst_strides[dim]), dim});
+    }
+
+    std::sort(src_stride_order.begin(), src_stride_order.end());
+    std::sort(dst_stride_order.begin(), dst_stride_order.end());
+
+    // Check whether the sorted dimension orders are exactly reversed
+    bool is_reversed = true;
+    for (size_t i = 0; i < valid_dims.size(); ++i) {
+        if (src_stride_order[i].second != dst_stride_order[valid_dims.size() - 1 - i].second) {
+            is_reversed = false;
+            break;
+        }
+    }
+
+    return is_reversed;
+}
+
 /**
  * Prepare the tensor-rearrange parameters from the given metadata. This function mainly:
  * 1. Adjusts the unit size from the original metadata to one better suited to the GPU
@@ -492,7 +578,7 @@ infiniStatus_t launchDynamicKernel(
         kernel_func = (void *)rearrange_dynamic_kernel<float4>;
         break;
     case 32:
-        kernel_func = (void *)rearrange_dynamic_kernel<double4>;
+        kernel_func = (void *)rearrange_dynamic_kernel<double4_32a>;
         break;
     default:
         return INFINI_STATUS_BAD_PARAM;
@@ -617,6 +703,63 @@ infiniStatus_t launchKernel(
     return INFINI_STATUS_SUCCESS;
 }
 
+/**
+ * Launch the transpose-optimized kernel
+ * Uses the optimized implementation for full-transpose scenarios
+ */
+infiniStatus_t launchTransposeKernel(
+    void *y,
+    const void *x,
+    const utils::RearrangeMeta &meta,
+    cudaStream_t stream) {
+
+    const size_t ndim = meta.ndim();
+    const size_t unit = meta.unit();
+    const ptrdiff_t *idx_strides = meta.idx_strides();
+    const ptrdiff_t *src_strides = meta.src_strides();
+    const ptrdiff_t *dst_strides = meta.dst_strides();
+
+    // Reconstruct the shape
+    std::vector<size_t> shape(ndim);
+    auto prev_idx_stride = meta.count();
+    for (size_t i = 0; i < ndim; ++i) {
+        shape[i] = prev_idx_stride / idx_strides[i];
+        prev_idx_stride = idx_strides[i];
+    }
+
+    // Compute the total element count
+    size_t total_elements = 1;
+    for (size_t i = 0; i < ndim; ++i) {
+        total_elements *= shape[i];
+    }
+
+    // Pick a kernel based on ndim and unit
+    if (ndim == 6 && total_elements > 100000 && unit == 4) {
+        // Large 6D transpose - use the specialized kernel for F32
+        const int threads = 256;
+        const int blocks = (total_elements + threads - 1) / threads;
+
+        auto *src_f32 = reinterpret_cast<float *>(const_cast<void *>(x));
+        auto *dst_f32 = reinterpret_cast<float *>(y);
+
+        transpose_6d_kernel_optimized<float><<<blocks, threads, 0, stream>>>(
+            dst_f32, src_f32,
+            shape[0], shape[1], shape[2], shape[3], shape[4], shape[5],
+            src_strides[0] / unit, src_strides[1] / unit, src_strides[2] / unit,
+            src_strides[3] / unit, src_strides[4] / unit, src_strides[5] / unit,
+            dst_strides[0] / unit, dst_strides[1] / unit, dst_strides[2] / unit,
+            dst_strides[3] / unit, dst_strides[4] / unit, dst_strides[5] / unit,
+            total_elements);
+
+        CHECK_OR_RETURN(cudaGetLastError() == cudaSuccess, INFINI_STATUS_INTERNAL_ERROR);
+        return INFINI_STATUS_SUCCESS;
+    }
+
+    // For all other cases the generic transpose is not used yet (its performance
+    // is not good enough); return an error so the caller falls back to the
+    // original implementation
+    return INFINI_STATUS_BAD_PARAM;
+}
+
 infiniStatus_t Descriptor::calculate(
     void *y,
     const void *x,
@@ -631,6 +774,18 @@ infiniStatus_t Descriptor::calculate(
         return INFINI_STATUS_SUCCESS;
     }
 
+    // Detect the full-transpose pattern
+    // Currently only large 6D transposes are optimized
+    if (_meta.ndim() == 6 && isFullTransposePattern(_meta)) {
+        // Use the optimized transpose kernel
+        auto status = launchTransposeKernel(y, x, _meta, cuda_stream);
+        // If the transpose kernel succeeded, return directly
+        if (status == INFINI_STATUS_SUCCESS) {
+            return status;
+        }
+        // Otherwise fall back to the generic implementation
+    }
+
     // Query the device properties
     int max_threads = _opaque->internal->maxThreadsPerBlock();
 
diff --git a/src/infiniop/ops/rearrange/nvidia/rearrange_transpose_kernel.cuh b/src/infiniop/ops/rearrange/nvidia/rearrange_transpose_kernel.cuh
new file mode 100644
index 000000000..91f7b0a08
--- /dev/null
+++ b/src/infiniop/ops/rearrange/nvidia/rearrange_transpose_kernel.cuh
@@ -0,0 +1,213 @@
+#ifndef __REARRANGE_TRANSPOSE_KERNEL_CUH__
+#define __REARRANGE_TRANSPOSE_KERNEL_CUH__
+
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include <vector>
+
+namespace op::rearrange::nvidia {
+
+// Shared-memory tile size configuration
+// A 32x32 tile gives the best bank-conflict avoidance and performance
+constexpr int TILE_DIM = 32;
+constexpr int BLOCK_ROWS = 8; // each thread handles several rows to improve occupancy
+
+/**
+ * Transpose pattern information
+ */
+struct TransposeInfo {
+    size_t ndim;           // number of dimensions
+    size_t total_elements; // total element count
+    size_t element_size;   // element size in bytes
+
+    // Parameters for flattening the multi-dimensional index
+    std::vector<size_t> shape;
+    std::vector<ptrdiff_t> src_strides; // source strides (bytes)
+    std::vector<ptrdiff_t> dst_strides; // destination strides (bytes)
+};
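+
+// Note on the tile layout (added for clarity): a TILE_DIM x TILE_DIM tile is
+// meant to be processed by a (TILE_DIM, BLOCK_ROWS) thread block, so each
+// thread covers TILE_DIM / BLOCK_ROWS = 4 rows. The +1 padding column used by
+// the kernels below staggers the rows across shared-memory banks: without it,
+// column accesses tile[threadIdx.x][i] would have a stride of 32 words and all
+// land in the same bank.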
+
+/**
+ * 2D transpose kernel - uses shared-memory tiling
+ * Suitable for plain 2D matrix transposes
+ */
+template <typename T>
+__global__ void transpose_2d_kernel_tiled(
+    T *__restrict__ dst,
+    const T *__restrict__ src,
+    size_t rows,
+    size_t cols,
+    ptrdiff_t src_row_stride, // in units of T
+    ptrdiff_t src_col_stride,
+    ptrdiff_t dst_row_stride,
+    ptrdiff_t dst_col_stride) {
+
+    __shared__ T tile[TILE_DIM][TILE_DIM + 1]; // +1 avoids bank conflicts
+
+    // Global coordinates
+    size_t x = blockIdx.x * TILE_DIM + threadIdx.x;
+    size_t y = blockIdx.y * TILE_DIM + threadIdx.y;
+
+    // Read from the source into shared memory (coalesced reads)
+    if (x < cols && y < rows) {
+        size_t src_idx = y * src_row_stride + x * src_col_stride;
+        tile[threadIdx.y][threadIdx.x] = src[src_idx];
+    }
+
+    __syncthreads();
+
+    // Transposed coordinates
+    x = blockIdx.y * TILE_DIM + threadIdx.x;
+    y = blockIdx.x * TILE_DIM + threadIdx.y;
+
+    // Write from shared memory to the destination (coalesced writes)
+    if (x < rows && y < cols) {
+        size_t dst_idx = y * dst_row_stride + x * dst_col_stride;
+        dst[dst_idx] = tile[threadIdx.x][threadIdx.y];
+    }
+}
+
+/**
+ * Multi-dimensional transpose kernel - generic version
+ * Handles row/column-major conversion for arbitrary dimensionality
+ *
+ * Strategy: use shared memory as a staging buffer and process in batches
+ */
+template <typename T>
+__global__ void transpose_nd_kernel(
+    T *__restrict__ dst,
+    const T *__restrict__ src,
+    const size_t *shape,          // [ndim]
+    const ptrdiff_t *src_strides, // [ndim] in units of T
+    const ptrdiff_t *dst_strides, // [ndim] in units of T
+    size_t ndim,
+    size_t total_elements) {
+
+    size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (gid >= total_elements) {
+        return;
+    }
+
+    // Compute the multi-dimensional index
+    size_t remaining = gid;
+    size_t src_offset = 0;
+    size_t dst_offset = 0;
+
+    // A local array for intermediate results keeps register pressure down
+    size_t indices[8]; // up to 8 dimensions supported
+
+    // Convert the linear index into a multi-dimensional one
+    for (int i = ndim - 1; i >= 0; i--) {
+        indices[i] = remaining % shape[i];
+        remaining /= shape[i];
+    }
+
+    // Compute the source and destination offsets
+    for (size_t i = 0; i < ndim; i++) {
+        src_offset += indices[i] * src_strides[i];
+        dst_offset += indices[i] * dst_strides[i];
+    }
+
+    // Perform the copy
+    dst[dst_offset] = src[src_offset];
+}
+
+/**
+ * Vectorized multi-dimensional transpose kernel
+ * Uses 4-element vectorized accesses when the data is aligned
+ */
+template <typename T>
+__global__ void transpose_nd_kernel_vec4(
+    T *__restrict__ dst,
+    const T *__restrict__ src,
+    const size_t *shape,
+    const ptrdiff_t *src_strides,
+    const ptrdiff_t *dst_strides,
+    size_t ndim,
+    size_t total_elements) {
+
+    size_t gid = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
+
+    if (gid + 3 >= total_elements) {
+        // Handle the boundary case
+        for (size_t i = gid; i < total_elements && i < gid + 4; i++) {
+            size_t remaining = i;
+            size_t src_offset = 0;
+            size_t dst_offset = 0;
+
+            size_t indices[8];
+            for (int j = ndim - 1; j >= 0; j--) {
+                indices[j] = remaining % shape[j];
+                remaining /= shape[j];
+            }
+
+            for (size_t j = 0; j < ndim; j++) {
+                src_offset += indices[j] * src_strides[j];
+                dst_offset += indices[j] * dst_strides[j];
+            }
+
+            dst[dst_offset] = src[src_offset];
+        }
+        return;
+    }
+
+    // Process 4 elements in a vectorized fashion
+    for (int k = 0; k < 4; k++) {
+        size_t i = gid + k;
+        size_t remaining = i;
+        size_t src_offset = 0;
+        size_t dst_offset = 0;
+
+        size_t indices[8];
+        for (int j = ndim - 1; j >= 0; j--) {
+            indices[j] = remaining % shape[j];
+            remaining /= shape[j];
+        }
+
+        for (size_t j = 0; j < ndim; j++) {
+            src_offset += indices[j] * src_strides[j];
+            dst_offset += indices[j] * dst_strides[j];
+        }
+
+        dst[dst_offset] = src[src_offset];
+    }
+}
+
+/**
+ * 6D-specialized transpose kernel - optimized for cases like (3,4,50,50,5,7)
+ * Uses better index computation and a better caching strategy
+ */
+template <typename T>
+__global__ void transpose_6d_kernel_optimized(
+    T *__restrict__ dst,
+    const T *__restrict__ src,
+    size_t d0, size_t d1, size_t d2, size_t d3, size_t d4, size_t d5,
+    ptrdiff_t s0, ptrdiff_t s1, ptrdiff_t s2, ptrdiff_t s3, ptrdiff_t s4, ptrdiff_t s5,
+    ptrdiff_t d_s0, ptrdiff_t d_s1, ptrdiff_t d_s2, ptrdiff_t d_s3, ptrdiff_t d_s4, ptrdiff_t d_s5,
+    size_t total_elements) {
+
+    size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (gid >= total_elements) {
+        return;
+    }
+
+    // Unroll the index computation manually to reduce division operations
+    size_t idx = gid;
+    size_t i5 = idx % d5; idx /= d5;
+    size_t i4 = idx % d4; idx /= d4;
+    size_t i3 = idx % d3; idx /= d3;
+    size_t i2 = idx % d2; idx /= d2;
+    size_t i1 = idx % d1; idx /= d1;
+    size_t i0 = idx;
+
+    // Compute the source and destination offsets
+    size_t src_offset = i0 * s0 + i1 * s1 + i2 * s2 + i3 * s3 + i4 * s4 + i5 * s5;
+    size_t dst_offset = i0 * d_s0 + i1 * d_s1 + i2 * d_s2 + i3 * d_s3 + i4 * d_s4 + i5 * d_s5;
+
+    dst[dst_offset] = src[src_offset];
+}
+
+} // namespace op::rearrange::nvidia
+
+#endif // __REARRANGE_TRANSPOSE_KERNEL_CUH__

From 6a526c60d3aa480757083558b5225be336f59219 Mon Sep 17 00:00:00 2001
From: ttaohe
Date: Thu, 18 Dec 2025 20:13:08 +0800
Subject: [PATCH 3/4] rearrange op test bench

---
 test/infinicore/ops/rearrange.py | 119 +++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100644 test/infinicore/ops/rearrange.py

diff --git a/test/infinicore/ops/rearrange.py b/test/infinicore/ops/rearrange.py
new file mode 100644
index 000000000..c5854b72a
--- /dev/null
+++ b/test/infinicore/ops/rearrange.py
@@ -0,0 +1,119 @@
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import torch
+import infinicore
+from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner
+
+
+def row_major_strides(shape):
+    """Generate row-major strides."""
+    stride = 1
+    strides = [1]
+    for dim in reversed(shape[1:]):
+        stride *= dim
+        strides.insert(0, stride)
+    return tuple(strides)
+
+
+def column_major_strides(shape):
+    """Generate column-major strides."""
+    stride = 1
+    strides = [stride]
+    for dim in shape[:-1]:
+        stride *= dim
+        strides.append(stride)
+    return tuple(strides)
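+
+
+# Example (added for illustration): for shape (3, 4, 5),
+# row_major_strides((3, 4, 5)) == (20, 5, 1) and
+# column_major_strides((3, 4, 5)) == (1, 3, 12).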
+
+
+# Test cases: (shape, input_strides, output_strides)
+_TEST_CASES_DATA = [
+    # 2D transpose
+    ((100, 100), (1, 100), (100, 1)),
+    ((2000, 2000), (1, 2000), (2000, 1)),
+
+    # 5D row/column-major transpose
+    ((3, 4, 7, 53, 9),
+     row_major_strides((3, 4, 7, 53, 9)),
+     column_major_strides((3, 4, 7, 53, 9))),
+
+    # 6D row/column-major transpose (the main optimization target)
+    ((3, 4, 50, 50, 5, 7),
+     row_major_strides((3, 4, 50, 50, 5, 7)),
+     column_major_strides((3, 4, 50, 50, 5, 7))),
+]
+
+_TOLERANCE_MAP = {
+    infinicore.float16: {"atol": 0, "rtol": 0},
+    infinicore.float32: {"atol": 0, "rtol": 0},
+}
+
+_TENSOR_DTYPES = [infinicore.float16, infinicore.float32]
+
+
+def parse_test_cases():
+    test_cases = []
+    for data in _TEST_CASES_DATA:
+        shape, in_strides, out_strides = data
+
+        for dtype in _TENSOR_DTYPES:
+            tol = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 0})
+
+            # Input tensor spec
+            in_spec = TensorSpec.from_tensor(shape, in_strides, dtype)
+
+            test_cases.append(
+                TestCase(
+                    inputs=[in_spec],
+                    kwargs={"output_strides": out_strides},
+                    output_spec=None,
+                    comparison_target=None,
+                    tolerance=tol,
+                    description=f"rearrange {shape} {dtype}",
+                )
+            )
+
+    return test_cases
+
+
+class OpTest(BaseOperatorTest):
+    """Rearrange operator test - stride-rearranging copy"""
+
+    def __init__(self):
+        super().__init__("Rearrange")
+
+    def get_test_cases(self):
+        return parse_test_cases()
+
+    def torch_operator(self, input_tensor, output_strides):
+        """PyTorch rearrange implementation - uses as_strided."""
+        # Create the output tensor
+        output = torch.empty_like(input_tensor)
+        # Re-set the strides
+        output = output.as_strided(input_tensor.shape, output_strides)
+        # Perform the copy
+        output.copy_(input_tensor)
+        return output
+
+    def infinicore_operator(self, input_tensor, output_strides):
+        """InfiniCore rearrange implementation - uses as_strided."""
+        # Create the output tensor
+        output = infinicore.empty_like(input_tensor)
+        # Re-set the strides
+        output = output.as_strided(input_tensor.shape, output_strides)
+        # Perform the copy
+        output.copy_(input_tensor)
+        return output
+
+
+def main():
+    """Main entry point"""
+    runner = GenericTestRunner(OpTest)
+    runner.run_and_exit()
+
+
+if __name__ == "__main__":
+    main()

From 03a11667ddbea525e3d84b954d89e675b5f9f3ee Mon Sep 17 00:00:00 2001
From: ttaohe
Date: Sat, 20 Dec 2025 11:44:06 +0800
Subject: [PATCH 4/4] improve test cases

---
 .../ops/rearrange/nvidia/rearrange_nvidia.cu  | 256 +++++++++++++--
 .../nvidia/rearrange_transpose_kernel.cuh     | 306 +++++++++++++++++-
 test/infinicore/framework/utils.py            |   3 +-
 test/infinicore/ops/rearrange.py              |  34 +-
 4 files changed, 549 insertions(+), 50 deletions(-)

diff --git a/src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cu b/src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cu
index 6f8b52082..c5698af20 100644
--- a/src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cu
+++ b/src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cu
@@ -114,10 +114,12 @@ bool isFullTransposePattern(const utils::RearrangeMeta &meta) {
         total_elements *= shape[i];
     }
 
-    // Only enable the transpose optimization for large data (>100K elements)
-    if (total_elements < 100000) {
-        return false;
-    }
+    // Only enable the transpose optimization at sufficient scale:
+    // 6D: 100K+; 5D: 20K+ (covers medium full transposes such as (3,4,7,53,9)); 4D: 50K+
+    size_t threshold = 100000;
+    if (ndim == 5) threshold = 20000;
+    if (ndim == 4) threshold = 50000;
+    if (total_elements < threshold) return false;
 
     // Skip dimensions of size 1 and collect the indices of the effective dimensions
     std::vector<size_t> valid_dims;
@@ -732,24 +734,233 @@ infiniStatus_t launchTransposeKernel(
     for (size_t i = 0; i < ndim; ++i) {
         total_elements *= shape[i];
     }
 
+    // 2D large-matrix row<->column major layout transform:
+    // - src column-major (stride=(1, M)) -> dst row-major (stride=(N, 1))
+    // - src row-major (stride=(N, 1)) -> dst column-major (stride=(1, M))
+    // Such cases are equivalent to a 2D transpose; a tiled transpose is
+    // required to get close to the bandwidth limit.
+    // Only enable for sufficiently large matrices; for small sizes (e.g., 100x100),
+    // the generic rearrange kernel can be faster than a shared-memory tiled transpose.
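+    // Worked example (added for clarity): for d0 = M = 2000, d1 = N = 2000 and
+    // unit = 4 bytes, a column-major src has byte strides (4, 8000) and a
+    // row-major dst has (8000, 4), which matches Pattern A below
+    // (s0 == u, s1 == d0 * u, t0 == d1 * u, t1 == u).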
+    if (ndim == 2 && total_elements >= 65536 && (unit == 2 || unit == 4)) {
+        const size_t d0 = shape[0]; // M
+        const size_t d1 = shape[1]; // N
+
+        const ptrdiff_t s0 = src_strides[0];
+        const ptrdiff_t s1 = src_strides[1];
+        const ptrdiff_t t0 = dst_strides[0];
+        const ptrdiff_t t1 = dst_strides[1];
+
+        // Only positive strides are supported (negative strides are not handled yet)
+        if (s0 > 0 && s1 > 0 && t0 > 0 && t1 > 0) {
+            const ptrdiff_t u = static_cast<ptrdiff_t>(unit);
+
+            // Pattern A: src col-major -> dst row-major
+            // src: (1, d0) * unit, dst: (d1, 1) * unit
+            const bool src_is_col_major = (s0 == u) && (s1 == static_cast<ptrdiff_t>(d0) * u);
+            const bool dst_is_row_major = (t0 == static_cast<ptrdiff_t>(d1) * u) && (t1 == u);
+
+            // Pattern B: src row-major -> dst col-major
+            // src: (d1, 1) * unit, dst: (1, d0) * unit
+            const bool src_is_row_major = (s0 == static_cast<ptrdiff_t>(d1) * u) && (s1 == u);
+            const bool dst_is_col_major = (t0 == u) && (t1 == static_cast<ptrdiff_t>(d0) * u);
+
+            // Optional one-shot debug for pattern matching
+            static bool transpose_debug_enabled = []() {
+                const char* env = std::getenv("REARRANGE_DEBUG_TRANSPOSE");
+                return env != nullptr && std::string(env) == "1";
+            }();
+            static bool transpose_debug_printed = false;
+            if (transpose_debug_enabled && !transpose_debug_printed) {
+                transpose_debug_printed = true;
+                printf("\n=== Rearrange Transpose Debug ===\n");
+                printf("ndim=2, unit=%zu, shape=(%zu,%zu), total=%zu\n", unit, d0, d1, total_elements);
+                printf("src_strides(bytes)=(%td,%td), dst_strides(bytes)=(%td,%td)\n", s0, s1, t0, t1);
+                printf("match: src_col=%d dst_row=%d src_row=%d dst_col=%d\n",
+                       (int)src_is_col_major, (int)dst_is_row_major, (int)src_is_row_major, (int)dst_is_col_major);
+                printf("===============================\n");
+            }
+
+            // block=(32,8) + shared-memory tile, following the CUDA transpose sample
+            dim3 block(TILE_DIM, BLOCK_ROWS, 1);
+            dim3 block_small(TILE_DIM_SMALL, BLOCK_ROWS_SMALL, 1);
+
+            if (src_is_col_major && dst_is_row_major) {
+                // Interpret as a contiguous row-major transpose A(N,M) -> B(M,N)
+                const size_t rows = d1; // N
+                const size_t cols = d0; // M
+                const bool use_small = (rows <= 256 && cols <= 256);
+                dim3 grid(
+                    (cols + (use_small ? TILE_DIM_SMALL : TILE_DIM) - 1) / (use_small ? TILE_DIM_SMALL : TILE_DIM),
+                    (rows + (use_small ? TILE_DIM_SMALL : TILE_DIM) - 1) / (use_small ? TILE_DIM_SMALL : TILE_DIM),
+                    1);
+
+                const ptrdiff_t src_row = src_strides[1] / unit; // M
+                const ptrdiff_t src_col = src_strides[0] / unit; // 1
+                const ptrdiff_t dst_row = dst_strides[0] / unit; // N
+                const ptrdiff_t dst_col = dst_strides[1] / unit; // 1
+
+                if (unit == 4) {
+                    using T = uint32_t;
+                    if (use_small) {
+                        transpose_2d_kernel_tiled_small<T><<<grid, block_small, 0, stream>>>(
+                            reinterpret_cast<T *>(y),
+                            reinterpret_cast<const T *>(x),
+                            rows, cols,
+                            src_row, src_col,
+                            dst_row, dst_col);
+                    } else {
+                        transpose_2d_kernel_tiled<T><<<grid, block, 0, stream>>>(
+                            reinterpret_cast<T *>(y),
+                            reinterpret_cast<const T *>(x),
+                            rows, cols,
+                            src_row, src_col,
+                            dst_row, dst_col);
+                    }
+                } else {
+                    using T = uint16_t;
+                    if (use_small) {
+                        transpose_2d_kernel_tiled_small<T><<<grid, block_small, 0, stream>>>(
+                            reinterpret_cast<T *>(y),
+                            reinterpret_cast<const T *>(x),
+                            rows, cols,
+                            src_row, src_col,
+                            dst_row, dst_col);
+                    } else {
+                        transpose_2d_kernel_tiled<T><<<grid, block, 0, stream>>>(
+                            reinterpret_cast<T *>(y),
+                            reinterpret_cast<const T *>(x),
+                            rows, cols,
+                            src_row, src_col,
+                            dst_row, dst_col);
+                    }
+                }
+                CHECK_OR_RETURN(cudaGetLastError() == cudaSuccess, INFINI_STATUS_INTERNAL_ERROR);
+                return INFINI_STATUS_SUCCESS;
+            }
+
+            if (src_is_row_major && dst_is_col_major) {
+                // Interpret as transpose A(M,N) -> B(N,M), writing into the dst
+                // col-major layout (equivalent to row-major N x M)
+                const size_t rows = d0; // M
+                const size_t cols = d1; // N
+                const bool use_small = (rows <= 256 && cols <= 256);
+                dim3 grid(
+                    (cols + (use_small ? TILE_DIM_SMALL : TILE_DIM) - 1) / (use_small ? TILE_DIM_SMALL : TILE_DIM),
+                    (rows + (use_small ? TILE_DIM_SMALL : TILE_DIM) - 1) / (use_small ? TILE_DIM_SMALL : TILE_DIM),
+                    1);
+
+                const ptrdiff_t src_row = src_strides[0] / unit; // N
+                const ptrdiff_t src_col = src_strides[1] / unit; // 1
+                const ptrdiff_t dst_row = dst_strides[1] / unit; // M
+                const ptrdiff_t dst_col = dst_strides[0] / unit; // 1
+
+                if (unit == 4) {
+                    using T = uint32_t;
+                    if (use_small) {
+                        transpose_2d_kernel_tiled_small<T><<<grid, block_small, 0, stream>>>(
+                            reinterpret_cast<T *>(y),
+                            reinterpret_cast<const T *>(x),
+                            rows, cols,
+                            src_row, src_col,
+                            dst_row, dst_col);
+                    } else {
+                        transpose_2d_kernel_tiled<T><<<grid, block, 0, stream>>>(
+                            reinterpret_cast<T *>(y),
+                            reinterpret_cast<const T *>(x),
+                            rows, cols,
+                            src_row, src_col,
+                            dst_row, dst_col);
+                    }
+                } else {
+                    using T = uint16_t;
+                    if (use_small) {
+                        transpose_2d_kernel_tiled_small<T><<<grid, block_small, 0, stream>>>(
+                            reinterpret_cast<T *>(y),
+                            reinterpret_cast<const T *>(x),
+                            rows, cols,
+                            src_row, src_col,
+                            dst_row, dst_col);
+                    } else {
+                        transpose_2d_kernel_tiled<T><<<grid, block, 0, stream>>>(
+                            reinterpret_cast<T *>(y),
+                            reinterpret_cast<const T *>(x),
+                            rows, cols,
+                            src_row, src_col,
+                            dst_row, dst_col);
+                    }
+                }
+                CHECK_OR_RETURN(cudaGetLastError() == cudaSuccess, INFINI_STATUS_INTERNAL_ERROR);
+                return INFINI_STATUS_SUCCESS;
+            }
+        }
+    }
+
     // Pick a kernel based on ndim and unit
+    if (ndim == 5 && total_elements > 20000 && (unit == 2 || unit == 4)) {
+        constexpr int VEC = 4;
+        const int threads = 256;
+        const int blocks = (static_cast<int>((total_elements + VEC - 1) / VEC) + threads - 1) / threads;
+
+        if (unit == 4) {
+            auto *src_f32 = reinterpret_cast<float *>(const_cast<void *>(x));
+            auto *dst_f32 = reinterpret_cast<float *>(y);
+
+            transpose_5d_kernel_inc<float, VEC><<<blocks, threads, 0, stream>>>(
+                dst_f32, src_f32,
+                shape[0], shape[1], shape[2], shape[3], shape[4],
+                src_strides[0] / unit, src_strides[1] / unit, src_strides[2] / unit, src_strides[3] / unit, src_strides[4] / unit,
+                dst_strides[0] / unit, dst_strides[1] / unit, dst_strides[2] / unit, dst_strides[3] / unit, dst_strides[4] / unit,
+                total_elements);
+        } else {
+            using T = uint16_t;
+            auto *src_u16 = reinterpret_cast<T *>(const_cast<void *>(x));
+            auto *dst_u16 = reinterpret_cast<T *>(y);
+
+            transpose_5d_kernel_inc<T, VEC><<<blocks, threads, 0, stream>>>(
+                dst_u16, src_u16,
+                shape[0], shape[1], shape[2], shape[3], shape[4],
+                src_strides[0] / unit, src_strides[1] / unit, src_strides[2] / unit, src_strides[3] / unit, src_strides[4] / unit,
+                dst_strides[0] / unit, dst_strides[1] / unit, dst_strides[2] / unit, dst_strides[3] / unit, dst_strides[4] / unit,
+                total_elements);
+        }
+
+        CHECK_OR_RETURN(cudaGetLastError() == cudaSuccess, INFINI_STATUS_INTERNAL_ERROR);
+        return INFINI_STATUS_SUCCESS;
+    }
+
-    if (ndim == 6 && total_elements > 100000 && unit == 4) {
-        // Large 6D transpose - use the specialized kernel for F32
+    if (ndim == 6 && total_elements > 100000 && (unit == 2 || unit == 4)) {
+        // Large 6D transpose - use the specialized kernel for F16/F32
+        constexpr int VEC = 4;
         const int threads = 256;
-        const int blocks = (total_elements + threads - 1) / threads;
-
-        auto *src_f32 = reinterpret_cast<float *>(const_cast<void *>(x));
-        auto *dst_f32 = reinterpret_cast<float *>(y);
+        const int blocks = (static_cast<int>((total_elements + VEC - 1) / VEC) + threads - 1) / threads;
 
-        transpose_6d_kernel_optimized<float><<<blocks, threads, 0, stream>>>(
-            dst_f32, src_f32,
-            shape[0], shape[1], shape[2], shape[3], shape[4], shape[5],
-            src_strides[0] / unit, src_strides[1] / unit, src_strides[2] / unit,
-            src_strides[3] / unit, src_strides[4] / unit, src_strides[5] / unit,
-            dst_strides[0] / unit, dst_strides[1] / unit, dst_strides[2] / unit,
-            dst_strides[3] / unit, dst_strides[4] / unit, dst_strides[5] / unit,
-            total_elements);
+        if (unit == 4) {
+            auto *src_f32 = reinterpret_cast<float *>(const_cast<void *>(x));
+            auto *dst_f32 = reinterpret_cast<float *>(y);
+
+            transpose_6d_kernel_inc<float, VEC><<<blocks, threads, 0, stream>>>(
+                dst_f32, src_f32,
+                shape[0], shape[1], shape[2], shape[3], shape[4], shape[5],
+                src_strides[0] / unit, src_strides[1] / unit, src_strides[2] / unit,
+                src_strides[3] / unit, src_strides[4] / unit, src_strides[5] / unit,
+                dst_strides[0] / unit, dst_strides[1] / unit, dst_strides[2] / unit,
+                dst_strides[3] / unit, dst_strides[4] / unit, dst_strides[5] / unit,
+                total_elements);
+        } else {
+            // unit == 2: use uint16_t for a bitwise copy (float16/bfloat16 are both 2 bytes here)
+            using T = uint16_t;
+            auto *src_u16 = reinterpret_cast<T *>(const_cast<void *>(x));
+            auto *dst_u16 = reinterpret_cast<T *>(y);
+
+            transpose_6d_kernel_inc<T, VEC><<<blocks, threads, 0, stream>>>(
+                dst_u16, src_u16,
+                shape[0], shape[1], shape[2], shape[3], shape[4], shape[5],
+                src_strides[0] / unit, src_strides[1] / unit, src_strides[2] / unit,
+                src_strides[3] / unit, src_strides[4] / unit, src_strides[5] / unit,
+                dst_strides[0] / unit, dst_strides[1] / unit, dst_strides[2] / unit,
+                dst_strides[3] / unit, dst_strides[4] / unit, dst_strides[5] / unit,
+                total_elements);
+        }
 
         CHECK_OR_RETURN(cudaGetLastError() == cudaSuccess, INFINI_STATUS_INTERNAL_ERROR);
         return INFINI_STATUS_SUCCESS;
@@ -774,9 +985,10 @@ infiniStatus_t Descriptor::calculate(
         return INFINI_STATUS_SUCCESS;
     }
 
-    // Detect the full-transpose pattern
-    // Currently only large 6D transposes are optimized
-    if (_meta.ndim() == 6 && isFullTransposePattern(_meta)) {
+    // Detect the full-transpose patterns:
+    // - 2D: row<->column major fast path
+    // - 4D~6D: full-transpose (stride order reversed) fast path
+    if (_meta.ndim() == 2 || ((_meta.ndim() >= 4 && _meta.ndim() <= 6) && isFullTransposePattern(_meta))) {
         // Use the optimized transpose kernel
         auto status = launchTransposeKernel(y, x, _meta, cuda_stream);
         // If the transpose kernel succeeded, return directly
diff --git a/src/infiniop/ops/rearrange/nvidia/rearrange_transpose_kernel.cuh b/src/infiniop/ops/rearrange/nvidia/rearrange_transpose_kernel.cuh
index 91f7b0a08..37daa7941 100644
--- a/src/infiniop/ops/rearrange/nvidia/rearrange_transpose_kernel.cuh
+++ b/src/infiniop/ops/rearrange/nvidia/rearrange_transpose_kernel.cuh
@@ -11,6 +11,10 @@ namespace op::rearrange::nvidia {
 constexpr int TILE_DIM = 32;
 constexpr int BLOCK_ROWS = 8; // each thread handles several rows to improve occupancy
 
+// Tile configuration dedicated to small matrices: lowers launch/synchronization overhead
+constexpr int TILE_DIM_SMALL = 16;
+constexpr int BLOCK_ROWS_SMALL = 8;
+
 /**
  * Transpose pattern information
  */
@@ -47,9 +51,14 @@ __global__ void transpose_2d_kernel_tiled(
     size_t y = blockIdx.y * TILE_DIM + threadIdx.y;
 
     // Read from the source into shared memory (coalesced reads)
-    if (x < cols && y < rows) {
-        size_t src_idx = y * src_row_stride + x * src_col_stride;
-        tile[threadIdx.y][threadIdx.x] = src[src_idx];
+    // Use BLOCK_ROWS to improve occupancy: each thread covers several rows
+    #pragma unroll
+    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
+        size_t yj = y + j;
+        if (x < cols && yj < rows) {
+            size_t src_idx = yj * src_row_stride + x * src_col_stride;
+            tile[threadIdx.y + j][threadIdx.x] = src[src_idx];
+        }
     }
 
     __syncthreads();
@@ -59,9 +68,123 @@ __global__ void transpose_2d_kernel_tiled(
     y = blockIdx.x * TILE_DIM + threadIdx.y;
 
     // Write from shared memory to the destination (coalesced writes)
-    if (x < rows && y < cols) {
-        size_t dst_idx = y * dst_row_stride + x * dst_col_stride;
-        dst[dst_idx] = tile[threadIdx.x][threadIdx.y];
+    #pragma unroll
+    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
+        size_t yj = y + j;
+        if (x < rows && yj < cols) {
+            size_t dst_idx = yj * dst_row_stride + x * dst_col_stride;
+            dst[dst_idx] = tile[threadIdx.x][threadIdx.y + j];
+        }
+    }
+}
+
+/**
+ * 2D transpose kernel (small-matrix version, 16x16 tile)
+ *
+ * For matrices around 100x100, the block/smem/sync overhead of a 32x32 tile
+ * weighs more heavily; a 16x16 tile usually gets closer to PyTorch's
+ * small-matrix performance.
+ */
+template <typename T>
+__global__ void transpose_2d_kernel_tiled_small(
+    T *__restrict__ dst,
+    const T *__restrict__ src,
+    size_t rows,
+    size_t cols,
+    ptrdiff_t src_row_stride, // in units of T
+    ptrdiff_t src_col_stride,
+    ptrdiff_t dst_row_stride,
+    ptrdiff_t dst_col_stride) {
+
+    __shared__ T tile[TILE_DIM_SMALL][TILE_DIM_SMALL + 1];
+
+    size_t x = blockIdx.x * TILE_DIM_SMALL + threadIdx.x;
+    size_t y = blockIdx.y * TILE_DIM_SMALL + threadIdx.y;
+
+    #pragma unroll
+    for (int j = 0; j < TILE_DIM_SMALL; j += BLOCK_ROWS_SMALL) {
+        size_t yj = y + j;
+        if (x < cols && yj < rows) {
+            size_t src_idx = yj * src_row_stride + x * src_col_stride;
+            tile[threadIdx.y + j][threadIdx.x] = src[src_idx];
+        }
+    }
+
+    __syncthreads();
+
+    x = blockIdx.y * TILE_DIM_SMALL + threadIdx.x;
+    y = blockIdx.x * TILE_DIM_SMALL + threadIdx.y;
+
+    #pragma unroll
+    for (int j = 0; j < TILE_DIM_SMALL; j += BLOCK_ROWS_SMALL) {
+        size_t yj = y + j;
+        if (x < rows && yj < cols) {
+            size_t dst_idx = yj * dst_row_stride + x * dst_col_stride;
+            dst[dst_idx] = tile[threadIdx.x][threadIdx.y + j];
+        }
+    }
+}
+
+/**
+ * 2D transpose kernel (unit==2 only, 2-element vectorization)
+ *
+ * block.x=16; each thread handles two adjacent columns (32 columns in total);
+ * shared memory stays 32x(32+1). Intended for the typical transposes
+ * (row<->column major) where cols is even and src/dst have stride 1 along columns.
+ */
+__global__ void transpose_2d_kernel_tiled_u16x2(
+    uint16_t *__restrict__ dst,
+    const uint16_t *__restrict__ src,
+    size_t rows,
+    size_t cols,
+    ptrdiff_t src_row_stride, // in units of uint16_t
+    ptrdiff_t src_col_stride,
+    ptrdiff_t dst_row_stride,
+    ptrdiff_t dst_col_stride) {
+
+    __shared__ uint16_t tile[TILE_DIM][TILE_DIM + 1];
+
+    // Each thread covers 2 columns
+    size_t x = blockIdx.x * TILE_DIM + threadIdx.x * 2;
+    size_t y = blockIdx.y * TILE_DIM + threadIdx.y;
+
+    // Load: src -> tile
+    #pragma unroll
+    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
+        size_t yj = y + j;
+        if (yj < rows) {
+            size_t x0 = x;
+            size_t x1 = x + 1;
+            if (x0 < cols) {
+                size_t src_idx0 = yj * src_row_stride + x0 * src_col_stride;
+                tile[threadIdx.y + j][threadIdx.x * 2] = src[src_idx0];
+            }
+            if (x1 < cols) {
+                size_t src_idx1 = yj * src_row_stride + x1 * src_col_stride;
+                tile[threadIdx.y + j][threadIdx.x * 2 + 1] = src[src_idx1];
+            }
+        }
+    }
+
+    __syncthreads();
+
+    // Store: tile^T -> dst
+    x = blockIdx.y * TILE_DIM + threadIdx.x * 2;
+    y = blockIdx.x * TILE_DIM + threadIdx.y;
+
+    #pragma unroll
+    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
+        size_t yj = y + j;
+        if (yj < cols) {
+            size_t x0 = x;
+            size_t x1 = x + 1;
+            if (x0 < rows) {
+                size_t dst_idx0 = yj * dst_row_stride + x0 * dst_col_stride;
+                dst[dst_idx0] = tile[threadIdx.x * 2][threadIdx.y + j];
+            }
+            if (x1 < rows) {
+                size_t dst_idx1 = yj * dst_row_stride + x1 * dst_col_stride;
+                dst[dst_idx1] = tile[threadIdx.x * 2 + 1][threadIdx.y + j];
+            }
+        }
     }
 }
 
@@ -207,6 +330,177 @@ __global__ void transpose_6d_kernel_optimized(
     dst[dst_offset] = src[src_offset];
 }
 
+/**
+ * 6D transpose kernel (incremental-carry version)
+ *
+ * Key idea: each thread handles VEC consecutive gids.
+ * Only the first gid is decoded with div/mod; the following ones update the
+ * offsets with +stride plus carry corrections, reducing the indexing overhead.
+ */
+template <typename T, int VEC>
+__global__ void transpose_6d_kernel_inc(
+    T *__restrict__ dst,
+    const T *__restrict__ src,
+    size_t d0, size_t d1, size_t d2, size_t d3, size_t d4, size_t d5,
+    ptrdiff_t s0, ptrdiff_t s1, ptrdiff_t s2, ptrdiff_t s3, ptrdiff_t s4, ptrdiff_t s5,
+    ptrdiff_t t0, ptrdiff_t t1, ptrdiff_t t2, ptrdiff_t t3, ptrdiff_t t4, ptrdiff_t t5,
+    size_t total_elements) {
+
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t gid0 = tid * static_cast<size_t>(VEC);
+    if (gid0 >= total_elements) return;
+
+    // Decode gid0 into a 6D index (this still needs div/mod, but only once per thread)
+    size_t idx = gid0;
+    size_t i5 = idx % d5; idx /= d5;
+    size_t i4 = idx % d4; idx /= d4;
+    size_t i3 = idx % d3; idx /= d3;
+    size_t i2 = idx % d2; idx /= d2;
+    size_t i1 = idx % d1; idx /= d1;
+    size_t i0 = idx;
+
+    ptrdiff_t src_offset = static_cast<ptrdiff_t>(i0) * s0
+                         + static_cast<ptrdiff_t>(i1) * s1
+                         + static_cast<ptrdiff_t>(i2) * s2
+                         + static_cast<ptrdiff_t>(i3) * s3
+                         + static_cast<ptrdiff_t>(i4) * s4
+                         + static_cast<ptrdiff_t>(i5) * s5;
+
+    ptrdiff_t dst_offset = static_cast<ptrdiff_t>(i0) * t0
+                         + static_cast<ptrdiff_t>(i1) * t1
+                         + static_cast<ptrdiff_t>(i2) * t2
+                         + static_cast<ptrdiff_t>(i3) * t3
+                         + static_cast<ptrdiff_t>(i4) * t4
+                         + static_cast<ptrdiff_t>(i5) * t5;
+
+    #pragma unroll
+    for (int k = 0; k < VEC; ++k) {
+        size_t gid = gid0 + static_cast<size_t>(k);
+        if (gid >= total_elements) break;
+
+        dst[dst_offset] = src[src_offset];
+
+        // i5++ with carries (correcting the src/dst offsets at the same time)
+        i5 += 1;
+        src_offset += s5;
+        dst_offset += t5;
+
+        if (i5 == d5) {
+            i5 = 0;
+            src_offset += s4 - static_cast<ptrdiff_t>(d5) * s5;
+            dst_offset += t4 - static_cast<ptrdiff_t>(d5) * t5;
+            i4 += 1;
+
+            if (i4 == d4) {
+                i4 = 0;
+                src_offset += s3 - static_cast<ptrdiff_t>(d4) * s4;
+                dst_offset += t3 - static_cast<ptrdiff_t>(d4) * t4;
+                i3 += 1;
+
+                if (i3 == d3) {
+                    i3 = 0;
+                    src_offset += s2 - static_cast<ptrdiff_t>(d3) * s3;
+                    dst_offset += t2 - static_cast<ptrdiff_t>(d3) * t3;
+                    i2 += 1;
+
+                    if (i2 == d2) {
+                        i2 = 0;
+                        src_offset += s1 - static_cast<ptrdiff_t>(d2) * s2;
+                        dst_offset += t1 - static_cast<ptrdiff_t>(d2) * t2;
+                        i1 += 1;
+
+                        if (i1 == d1) {
+                            i1 = 0;
+                            src_offset += s0 - static_cast<ptrdiff_t>(d1) * s1;
+                            dst_offset += t0 - static_cast<ptrdiff_t>(d1) * t1;
+                            i0 += 1;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
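+
+// Worked example of the carry update above (added for clarity): with d5 = 7,
+// stepping from i5 = 6 to the next element wraps i5 to 0 and increments i4,
+// so the offsets are corrected by s4 - 7 * s5 (resp. t4 - 7 * t5) in a single
+// step instead of being recomputed from scratch.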
+
+/**
+ * 5D transpose kernel (incremental-carry version)
+ *
+ * Each thread handles VEC consecutive gids; only the first gid is decoded
+ * with div/mod. The following ones update the offsets with +stride plus
+ * carry corrections, reducing the indexing overhead.
+ */
+template <typename T, int VEC>
+__global__ void transpose_5d_kernel_inc(
+    T *__restrict__ dst,
+    const T *__restrict__ src,
+    size_t d0, size_t d1, size_t d2, size_t d3, size_t d4,
+    ptrdiff_t s0, ptrdiff_t s1, ptrdiff_t s2, ptrdiff_t s3, ptrdiff_t s4,
+    ptrdiff_t t0, ptrdiff_t t1, ptrdiff_t t2, ptrdiff_t t3, ptrdiff_t t4,
+    size_t total_elements) {

+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t gid0 = tid * static_cast<size_t>(VEC);
+    if (gid0 >= total_elements) return;
+
+    size_t idx = gid0;
+    size_t i4 = idx % d4; idx /= d4;
+    size_t i3 = idx % d3; idx /= d3;
+    size_t i2 = idx % d2; idx /= d2;
+    size_t i1 = idx % d1; idx /= d1;
+    size_t i0 = idx;
+
+    ptrdiff_t src_offset = static_cast<ptrdiff_t>(i0) * s0
+                         + static_cast<ptrdiff_t>(i1) * s1
+                         + static_cast<ptrdiff_t>(i2) * s2
+                         + static_cast<ptrdiff_t>(i3) * s3
+                         + static_cast<ptrdiff_t>(i4) * s4;
+
+    ptrdiff_t dst_offset = static_cast<ptrdiff_t>(i0) * t0
+                         + static_cast<ptrdiff_t>(i1) * t1
+                         + static_cast<ptrdiff_t>(i2) * t2
+                         + static_cast<ptrdiff_t>(i3) * t3
+                         + static_cast<ptrdiff_t>(i4) * t4;
+
+    #pragma unroll
+    for (int k = 0; k < VEC; ++k) {
+        size_t gid = gid0 + static_cast<size_t>(k);
+        if (gid >= total_elements) break;
+
+        dst[dst_offset] = src[src_offset];
+
+        // i4++ and carry
+        i4 += 1;
+        src_offset += s4;
+        dst_offset += t4;
+
+        if (i4 == d4) {
+            i4 = 0;
+            src_offset += s3 - static_cast<ptrdiff_t>(d4) * s4;
+            dst_offset += t3 - static_cast<ptrdiff_t>(d4) * t4;
+            i3 += 1;
+
+            if (i3 == d3) {
+                i3 = 0;
+                src_offset += s2 - static_cast<ptrdiff_t>(d3) * s3;
+                dst_offset += t2 - static_cast<ptrdiff_t>(d3) * t3;
+                i2 += 1;
+
+                if (i2 == d2) {
+                    i2 = 0;
+                    src_offset += s1 - static_cast<ptrdiff_t>(d2) * s2;
+                    dst_offset += t1 - static_cast<ptrdiff_t>(d2) * t2;
+                    i1 += 1;
+
+                    if (i1 == d1) {
+                        i1 = 0;
+                        src_offset += s0 - static_cast<ptrdiff_t>(d1) * s1;
+                        dst_offset += t0 - static_cast<ptrdiff_t>(d1) * t1;
+                        i0 += 1;
+                    }
+                }
+            }
+        }
+    }
+}
+
 } // namespace op::rearrange::nvidia
 
 #endif // __REARRANGE_TRANSPOSE_KERNEL_CUH__
diff --git a/test/infinicore/framework/utils.py b/test/infinicore/framework/utils.py
index 8f30c4c78..311384314 100644
--- a/test/infinicore/framework/utils.py
+++ b/test/infinicore/framework/utils.py
@@ -391,7 +391,8 @@ def rearrange_tensor(tensor, new_strides):
     new_positions += offset
 
     # Copy the original data to the new tensor
-    new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1))
+    # NOTE: tensor may be non-contiguous; use reshape instead of view.
+    new_tensor.reshape(-1).index_add_(0, new_positions, tensor.reshape(-1))
 
     new_tensor.set_(new_tensor.untyped_storage(), offset, shape, tuple(new_strides))
     return new_tensor
diff --git a/test/infinicore/ops/rearrange.py b/test/infinicore/ops/rearrange.py
index c5854b72a..d279f3b4f 100644
--- a/test/infinicore/ops/rearrange.py
+++ b/test/infinicore/ops/rearrange.py
@@ -63,13 +63,15 @@ def parse_test_cases():
 
             # Input tensor spec
             in_spec = TensorSpec.from_tensor(shape, in_strides, dtype)
+            # Output tensor spec: create `out` with the target strides up front,
+            # so it is not reallocated on every iteration
+            out_spec = TensorSpec.from_tensor(shape, out_strides, dtype)
 
             test_cases.append(
                 TestCase(
                     inputs=[in_spec],
-                    kwargs={"output_strides": out_strides},
-                    output_spec=None,
-                    comparison_target=None,
+                    kwargs={},  # `out` is created by the framework from output_spec and passed to the operator
+                    output_spec=out_spec,
+                    comparison_target="out",  # in-place(out) benchmark: measure only the copy_ kernel
                     tolerance=tol,
                     description=f"rearrange {shape} {dtype}",
                 )
@@ -87,25 +89,15 @@ def __init__(self):
     def get_test_cases(self):
         return parse_test_cases()
 
-    def torch_operator(self, input_tensor, output_strides):
-        """PyTorch rearrange implementation - uses as_strided."""
-        # Create the output tensor
-        output = torch.empty_like(input_tensor)
-        # Re-set the strides
-        output = output.as_strided(input_tensor.shape, output_strides)
-        # Perform the copy
-        output.copy_(input_tensor)
-        return output
+    def torch_operator(self, input_tensor, out):
+        """PyTorch implementation: `out` already has the target strides (created from output_spec)."""
+        out.copy_(input_tensor)
+        return out
 
-    def infinicore_operator(self, input_tensor, output_strides):
-        """InfiniCore rearrange implementation - uses as_strided."""
-        # Create the output tensor
-        output = infinicore.empty_like(input_tensor)
-        # Re-set the strides
-        output = output.as_strided(input_tensor.shape, output_strides)
-        # Perform the copy
-        output.copy_(input_tensor)
-        return output
+    def infinicore_operator(self, input_tensor, out):
+        """InfiniCore implementation: `out` already has the target strides (created from output_spec)."""
+        out.copy_(input_tensor)
+        return out
 
 
 def main():