diff --git a/executor/op-mem-cuda/src/deepx/dtype_cuda.hpp b/executor/op-mem-cuda/src/deepx/dtype_cuda.hpp
index 98f212c..86b248e 100644
--- a/executor/op-mem-cuda/src/deepx/dtype_cuda.hpp
+++ b/executor/op-mem-cuda/src/deepx/dtype_cuda.hpp
@@ -4,6 +4,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <type_traits>
 
 #include "deepx/dtype.hpp"
 
@@ -56,6 +57,45 @@ namespace deepx
     struct to_tensor_type<...> { using type = __nv_fp8_e4m3; };
 
+    template <typename T>
+    struct fp8_format_map;
+
+    template <>
+    struct fp8_format_map<__nv_fp8_e5m2> {
+        static constexpr __nv_fp8_interpretation_t value = __NV_E5M2;
+    };
+
+    template <>
+    struct fp8_format_map<__nv_fp8_e4m3> {
+        static constexpr __nv_fp8_interpretation_t value = __NV_E4M3;
+    };
+
+    template <typename T>
+    struct is_fp8 : std::false_type {}; // false by default
+
+    template <> struct is_fp8<__nv_fp8_e4m3> : std::true_type {};
+    template <> struct is_fp8<__nv_fp8_e5m2> : std::true_type {};
+
+    template <typename T>
+    inline constexpr bool is_fp8_v = is_fp8<T>::value;
+
+    template <typename T>
+    struct to_half {
+        static __host__ __device__ __half convert(T a) {
+            return __nv_cvt_fp8_to_halfraw(static_cast<__nv_fp8_storage_t>(a), fp8_format_map<T>::value);
+        }
+    };
+
+    template <typename T>
+    struct to_fp8 {
+        static __host__ __device__ T convert(half a) {
+            return static_cast<T>(__nv_cvt_halfraw_to_fp8(a, __NV_SATFINITE, fp8_format_map<T>::value));
+        }
+    };
 }
 
 #endif // DEEPX_DTYPE_CUDA_HPP
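
Review note: the two traits are designed to compose: widen fp8 to __half, do the arithmetic there, then narrow back with saturation. A minimal usage sketch, assuming only the headers in this diff (the device function and its name are illustrative, not part of the change):

```cpp
// Illustrative only: assumes cuda_fp8.h (which pulls in cuda_fp16.h) is available.
#include <cuda_fp8.h>
#include "deepx/dtype_cuda.hpp"

__device__ void scale_by_two(const __nv_fp8_e4m3 *in, __nv_fp8_e4m3 *out)
{
    static_assert(deepx::is_fp8_v<__nv_fp8_e4m3>, "fp8 dispatch tag");
    // fp8 -> half, using the __NV_E4M3 interpretation supplied by fp8_format_map
    __half h = deepx::to_half<__nv_fp8_e4m3>::convert(*in);
    h = __hmul(h, __float2half(2.0f)); // do the arithmetic in half precision
    // half -> fp8 with __NV_SATFINITE: out-of-range values saturate to the finite max
    *out = deepx::to_fp8<__nv_fp8_e4m3>::convert(h);
}
```
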
diff --git a/executor/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh b/executor/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh
index 9f2c0b1..3e9476d 100644
--- a/executor/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh
+++ b/executor/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh
@@ -6,6 +6,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include "deepx/dtype_cuda.hpp"
 
 namespace deepx::tensorfunc
 {
@@ -38,24 +39,12 @@ namespace deepx::tensorfunc
         *out = hsqrt(*a);
     }
 
-    template <>
-    __device__ __forceinline__ void deepx_sqrt<__nv_fp8_e4m3>(const __nv_fp8_e4m3 *a, __nv_fp8_e4m3 *out)
+    template <typename T, std::enable_if_t<is_fp8_v<T>, int> = 0>
+    __device__ __forceinline__ void deepx_sqrt(const T *a, T *out)
     {
-        __half input_fp16 = __nv_cvt_fp8_to_halfraw(static_cast<__nv_fp8_storage_t>(*a), __NV_E4M3);
-        __half result_fp16 = hsqrt(input_fp16); // CUDA built-in half-precision square root
-        *out = static_cast<__nv_fp8_e4m3>(__nv_cvt_halfraw_to_fp8(result_fp16, __NV_SATFINITE, __NV_E4M3));
-    }
-
-    template <>
-    __device__ __forceinline__ void deepx_sqrt<__nv_fp8_e5m2>(const __nv_fp8_e5m2 *a, __nv_fp8_e5m2 *out)
-    {
-        __half input_fp16 = __nv_cvt_fp8_to_halfraw(static_cast<__nv_fp8_storage_t>(*a), __NV_E5M2);
-
-        // 2. take the square root
-        __half result_fp16 = hsqrt(input_fp16);
-
-        // 3. convert back to FP8 (E5M2 format)
-        *out = static_cast<__nv_fp8_e5m2>(__nv_cvt_halfraw_to_fp8(result_fp16, __NV_SATFINITE, __NV_E5M2));
+        __half input_half = to_half<T>::convert(*a);
+        __half result_half = hsqrt(input_half); // CUDA built-in half-precision square root
+        *out = to_fp8<T>::convert(result_half);
     }
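
Review note: the enable_if overload replaces two copy-pasted specializations and covers both fp8 formats at once. The same widen/compute/narrow pattern extends to other unary ops; a hypothetical deepx_exp sketch (not in this diff; hexp is the half-precision exponential intrinsic from cuda_fp16.h):

```cpp
// Hypothetical sketch, mirroring the deepx_sqrt pattern above.
#include <type_traits>
#include "deepx/dtype_cuda.hpp"

template <typename T, std::enable_if_t<deepx::is_fp8_v<T>, int> = 0>
__device__ __forceinline__ void deepx_exp(const T *a, T *out)
{
    __half h = deepx::to_half<T>::convert(*a); // widen fp8 -> half
    h = hexp(h);                               // compute in half precision
    *out = deepx::to_fp8<T>::convert(h);       // narrow back, saturating finite
}
```
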
diff --git a/executor/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu b/executor/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu
index 1eaa3dc..dc7d138 100644
--- a/executor/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu
+++ b/executor/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu
@@ -1,12 +1,14 @@
 #ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_CU
 #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_CU
 
+#include <cuda_fp8.h>
 #include "deepx/tensorfunc/cuda.hpp"
 #include "deepx/tensorfunc/authors.hpp"
 #include "deepx/tensorfunc/vector_cuda.cuh"
+#include "deepx/dtype_cuda.hpp"
 
 namespace deepx::tensorfunc
 {
-    template <typename T>
+    template <typename T, std::enable_if_t<!is_fp8_v<T>, int> = 0>
     __global__ void max_kernel(const T *A, const T *B, T *C, const int size)
     {
         int stride = blockDim.x * gridDim.x;
         for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = A[idx] > B[idx] ? A[idx] : B[idx];
         }
     }
 
@@ -16,6 +18,20 @@ namespace deepx::tensorfunc
+    template <typename T, std::enable_if_t<is_fp8_v<T>, int> = 0>
+    __global__ void max_kernel(const T *A, const T *B, T *C, const int size)
+    {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            __half temp_a = to_half<T>::convert(A[idx]);
+            __half temp_b = to_half<T>::convert(B[idx]);
+            __half temp_c = temp_a > temp_b ? temp_a : temp_b;
+            C[idx] = to_fp8<T>::convert(temp_c);
+        }
+    }
+
+
     template <typename T>
     void launch_max(const T *A, const T *B, T *C, const int size)
     {
@@ -32,8 +48,10 @@ namespace deepx::tensorfunc
     template void launch_max(const int32_t *A, const int32_t *B, int32_t *C, const int size);
     template void launch_max(const int16_t *A, const int16_t *B, int16_t *C, const int size);
     template void launch_max(const int8_t *A, const int8_t *B, int8_t *C, const int size);
+    template void launch_max<__nv_fp8_e4m3>(const __nv_fp8_e4m3 *A, const __nv_fp8_e4m3 *B, __nv_fp8_e4m3 *C, const int size);
+    template void launch_max<__nv_fp8_e5m2>(const __nv_fp8_e5m2 *A, const __nv_fp8_e5m2 *B, __nv_fp8_e5m2 *C, const int size);
 
-    template <typename T>
+    template <typename T, std::enable_if_t<!is_fp8_v<T>, int> = 0>
     __global__ void maxscalar_kernel(const T *A, const T scalar, T *C, const int size)
     {
         int stride = blockDim.x * gridDim.x;
         for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = A[idx] > scalar ? A[idx] : scalar;
         }
     }
 
+    template <typename T, std::enable_if_t<is_fp8_v<T>, int> = 0>
+    __global__ void maxscalar_kernel(const T *A, const T scalar, T *C, const int size)
+    {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            __half temp_a = to_half<T>::convert(A[idx]);
+            __half temp_scalar = to_half<T>::convert(scalar);
+            __half temp_c = temp_a > temp_scalar ? temp_a : temp_scalar;
+            C[idx] = to_fp8<T>::convert(temp_c);
+        }
+    }
+
     template <typename T>
     void launch_maxscalar(const T *A, const T scalar, T *C, const int size)
     {
@@ -59,8 +90,10 @@ namespace deepx::tensorfunc
     template void launch_maxscalar(const int32_t *A, const int32_t scalar, int32_t *C, const int size);
     template void launch_maxscalar(const int16_t *A, const int16_t scalar, int16_t *C, const int size);
     template void launch_maxscalar(const int8_t *A, const int8_t scalar, int8_t *C, const int size);
+    template void launch_maxscalar<__nv_fp8_e4m3>(const __nv_fp8_e4m3 *A, const __nv_fp8_e4m3 scalar, __nv_fp8_e4m3 *C, const int size);
+    template void launch_maxscalar<__nv_fp8_e5m2>(const __nv_fp8_e5m2 *A, const __nv_fp8_e5m2 scalar, __nv_fp8_e5m2 *C, const int size);
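
Review note: host-side usage sketch for the newly instantiated fp8 path. The launcher signature is taken from this file; the demo function, buffer names, and the assumption that the launcher is declared in a visible header are mine:

```cpp
#include <cuda_fp8.h>
#include <cuda_runtime.h>

void demo_fp8_max(int n)
{
    __nv_fp8_e4m3 *A, *B, *C;
    cudaMalloc(&A, n * sizeof(__nv_fp8_e4m3));
    cudaMalloc(&B, n * sizeof(__nv_fp8_e4m3));
    cudaMalloc(&C, n * sizeof(__nv_fp8_e4m3));
    // ... fill A and B on the device ...
    // elementwise max; the comparisons run in half precision inside the kernel
    deepx::tensorfunc::launch_max<__nv_fp8_e4m3>(A, B, C, n);
    cudaDeviceSynchronize();
    cudaFree(A); cudaFree(B); cudaFree(C);
}
```
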
 
-    template <typename T>
+    template <typename T, std::enable_if_t<!is_fp8_v<T>, int> = 0>
     __global__ void min_kernel(const T *A, const T *B, T *C, const int size)
     {
         int stride = blockDim.x * gridDim.x;
         for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = A[idx] < B[idx] ? A[idx] : B[idx];
         }
     }
 
@@ -70,6 +103,20 @@ namespace deepx::tensorfunc
+
+    template <typename T, std::enable_if_t<is_fp8_v<T>, int> = 0>
+    __global__ void min_kernel(const T *A, const T *B, T *C, const int size)
+    {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            __half temp_a = to_half<T>::convert(A[idx]);
+            __half temp_b = to_half<T>::convert(B[idx]);
+            __half temp_c = temp_a < temp_b ? temp_a : temp_b;
+            C[idx] = to_fp8<T>::convert(temp_c);
+        }
+    }
+
     template <typename T>
     void launch_min(const T *A, const T *B, T *C, const int size)
     {
@@ -86,8 +133,10 @@ namespace deepx::tensorfunc
     template void launch_min(const int32_t *A, const int32_t *B, int32_t *C, const int size);
     template void launch_min(const int16_t *A, const int16_t *B, int16_t *C, const int size);
     template void launch_min(const int8_t *A, const int8_t *B, int8_t *C, const int size);
+    template void launch_min<__nv_fp8_e4m3>(const __nv_fp8_e4m3 *A, const __nv_fp8_e4m3 *B, __nv_fp8_e4m3 *C, const int size);
+    template void launch_min<__nv_fp8_e5m2>(const __nv_fp8_e5m2 *A, const __nv_fp8_e5m2 *B, __nv_fp8_e5m2 *C, const int size);
 
-    template <typename T>
+    template <typename T, std::enable_if_t<!is_fp8_v<T>, int> = 0>
     __global__ void minscalar_kernel(const T *A, const T scalar, T *C, const int size)
     {
         int stride = blockDim.x * gridDim.x;
         for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = A[idx] < scalar ? A[idx] : scalar;
         }
     }
 
@@ -97,6 +146,19 @@ namespace deepx::tensorfunc
+    template <typename T, std::enable_if_t<is_fp8_v<T>, int> = 0>
+    __global__ void minscalar_kernel(const T *A, const T scalar, T *C, const int size)
+    {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            __half temp_a = to_half<T>::convert(A[idx]);
+            __half temp_scalar = to_half<T>::convert(scalar);
+            __half temp_c = temp_a < temp_scalar ? temp_a : temp_scalar;
+            C[idx] = to_fp8<T>::convert(temp_c);
+        }
+    }
+
     template <typename T>
     void launch_minscalar(const T *A, const T scalar, T *C, const int size)
     {
@@ -113,9 +175,11 @@ namespace deepx::tensorfunc
     template void launch_minscalar(const int32_t *A, const int32_t scalar, int32_t *C, const int size);
     template void launch_minscalar(const int16_t *A, const int16_t scalar, int16_t *C, const int size);
     template void launch_minscalar(const int8_t *A, const int8_t scalar, int8_t *C, const int size);
+    template void launch_minscalar<__nv_fp8_e4m3>(const __nv_fp8_e4m3 *A, const __nv_fp8_e4m3 scalar, __nv_fp8_e4m3 *C, const int size);
+    template void launch_minscalar<__nv_fp8_e5m2>(const __nv_fp8_e5m2 *A, const __nv_fp8_e5m2 scalar, __nv_fp8_e5m2 *C, const int size);
 
     // equal
-    template <typename T, typename MaskT>
+    template <typename T, typename MaskT, std::enable_if_t<!is_fp8_v<T>, int> = 0>
     __global__ void equalwithepsilon_kernel(const T *A, const T *B, const float epsilon, MaskT *mask, const int size)
     {
         int stride = blockDim.x * gridDim.x;
         for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             float diff = fabsf(static_cast<float>(A[idx]) - static_cast<float>(B[idx]));
             if (diff < epsilon)
             {
                 mask[idx] = 1;
             }
             else
             {
                 mask[idx] = 0;
             }
         }
     }
 
@@ -133,7 +197,28 @@ namespace deepx::tensorfunc
-    template <typename T, typename MaskT>
+    // equal
+    template <typename T, typename MaskT, std::enable_if_t<is_fp8_v<T>, int> = 0>
+    __global__ void equalwithepsilon_kernel(const T *A, const T *B, const float epsilon, MaskT *mask, const int size)
+    {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            float diff = fabsf(static_cast<float>(to_half<T>::convert(A[idx])) - static_cast<float>(to_half<T>::convert(B[idx])));
+            if (diff < epsilon)
+            {
+                mask[idx] = 1;
+            }
+            else
+            {
+                mask[idx] = 0;
+            }
+        }
+    }
+
+
+    template <typename T, typename MaskT, std::enable_if_t<!is_fp8_v<T>, int> = 0>
     __global__ void equal_kernel(const T *A, const T *B, MaskT *mask, const int size)
     {
         int stride = blockDim.x * gridDim.x;
         for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             mask[idx] = (A[idx] == B[idx]);
         }
     }
 
@@ -143,6 +228,16 @@ namespace deepx::tensorfunc
+    template <typename T, typename MaskT, std::enable_if_t<is_fp8_v<T>, int> = 0>
+    __global__ void equal_kernel(const T *A, const T *B, MaskT *mask, const int size)
+    {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            mask[idx] = (to_half<T>::convert(A[idx]) == to_half<T>::convert(B[idx]));
+        }
+    }
+
     template <typename T, typename MaskT>
     void launch_equal(const T *A, const T *B, const float epsilon, MaskT *mask, const int size)
     {
@@ -166,6 +261,8 @@ namespace deepx::tensorfunc
     template void launch_equal(const int32_t *A, const int32_t *B, const float epsilon, bool *mask, const int size);
     template void launch_equal(const int16_t *A, const int16_t *B, const float epsilon, bool *mask, const int size);
     template void launch_equal(const int8_t *A, const int8_t *B, const float epsilon, bool *mask, const int size);
+    template void launch_equal<__nv_fp8_e4m3, bool>(const __nv_fp8_e4m3 *A, const __nv_fp8_e4m3 *B, const float epsilon, bool *mask, const int size);
+    template void launch_equal<__nv_fp8_e5m2, bool>(const __nv_fp8_e5m2 *A, const __nv_fp8_e5m2 *B, const float epsilon, bool *mask, const int size);
 
     // equalscalar
     template <typename T, typename MaskT>
@@ -219,6 +316,8 @@ namespace deepx::tensorfunc
     template void launch_equalscalar(const int32_t *A, const int32_t scalar, const float epsilon, bool *mask, const int size);
     template void launch_equalscalar(const int16_t *A, const int16_t scalar, const float epsilon, bool *mask, const int size);
     template void launch_equalscalar(const int8_t *A, const int8_t scalar, const float epsilon, bool *mask, const int size);
+    // template void launch_equalscalar<__nv_fp8_e4m3, bool>(const __nv_fp8_e4m3 *A, const __nv_fp8_e4m3 scalar, const float epsilon, bool *mask, const int size);
+    // template void launch_equalscalar<__nv_fp8_e5m2, bool>(const __nv_fp8_e5m2 *A, const __nv_fp8_e5m2 scalar, const float epsilon, bool *mask, const int size);
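
Review note: the fp8 equalscalar instantiations stay commented out because there is no fp8 overload of equalscalar_kernel yet. If one is added, it would presumably mirror the fp8 equalwithepsilon_kernel above; a sketch, with the signature inferred from the launch_equalscalar instantiations (not part of this diff):

```cpp
template <typename T, typename MaskT, std::enable_if_t<deepx::is_fp8_v<T>, int> = 0>
__global__ void equalscalar_kernel(const T *A, const T scalar, const float epsilon, MaskT *mask, const int size)
{
    int stride = blockDim.x * gridDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
    {
        // compare in float, like the fp8 equalwithepsilon_kernel
        float diff = fabsf(static_cast<float>(deepx::to_half<T>::convert(A[idx])) -
                           static_cast<float>(deepx::to_half<T>::convert(scalar)));
        mask[idx] = diff < epsilon ? 1 : 0;
    }
}
```
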
 
     // not equal
     template <typename T, typename MaskT>
@@ -272,6 +371,8 @@ namespace deepx::tensorfunc
     template void launch_notequal(const int32_t *A, const int32_t *B, const float epsilon, bool *mask, const int size);
     template void launch_notequal(const int16_t *A, const int16_t *B, const float epsilon, bool *mask, const int size);
     template void launch_notequal(const int8_t *A, const int8_t *B, const float epsilon, bool *mask, const int size);
+    // template void launch_notequal<__nv_fp8_e4m3, bool>(const __nv_fp8_e4m3 *A, const __nv_fp8_e4m3 *B, const float epsilon, bool *mask, const int size);
+    // template void launch_notequal<__nv_fp8_e5m2, bool>(const __nv_fp8_e5m2 *A, const __nv_fp8_e5m2 *B, const float epsilon, bool *mask, const int size);
 
     // notequalscalar
     template <typename T, typename MaskT>
@@ -325,6 +426,8 @@ namespace deepx::tensorfunc
     template void launch_notequalscalar(const int32_t *A, const int32_t scalar, const float epsilon, bool *mask, const int size);
     template void launch_notequalscalar(const int16_t *A, const int16_t scalar, const float epsilon, bool *mask, const int size);
     template void launch_notequalscalar(const int8_t *A, const int8_t scalar, const float epsilon, bool *mask, const int size);
+    // template void launch_notequalscalar<__nv_fp8_e4m3, bool>(const __nv_fp8_e4m3 *A, const __nv_fp8_e4m3 scalar, const float epsilon, bool *mask, const int size);
+    // template void launch_notequalscalar<__nv_fp8_e5m2, bool>(const __nv_fp8_e5m2 *A, const __nv_fp8_e5m2 scalar, const float epsilon, bool *mask, const int size);
 
     // less
     template <typename T, typename MaskT>
@@ -353,6 +456,8 @@ namespace deepx::tensorfunc
     template void launch_less(const int32_t *A, const int32_t *B, bool *mask, const int size);
     template void launch_less(const int16_t *A, const int16_t *B, bool *mask, const int size);
     template void launch_less(const int8_t *A, const int8_t *B, bool *mask, const int size);
+    // template void launch_less<__nv_fp8_e4m3, bool>(const __nv_fp8_e4m3 *A, const __nv_fp8_e4m3 *B, bool *mask, const int size);
+    // template void launch_less<__nv_fp8_e5m2, bool>(const __nv_fp8_e5m2 *A, const __nv_fp8_e5m2 *B, bool *mask, const int size);
 
     // lessscalar
     template <typename T, typename MaskT>
@@ -382,7 +487,9 @@ namespace deepx::tensorfunc
     template void launch_lessscalar(const int32_t *A, const int32_t scalar, bool *mask, const int size);
     template void launch_lessscalar(const int16_t *A, const int16_t scalar, bool *mask, const int size);
     template void launch_lessscalar(const int8_t *A, const int8_t scalar, bool *mask, const int size);
-
+    // template void launch_lessscalar<__nv_fp8_e4m3, bool>(const __nv_fp8_e4m3 *A, const __nv_fp8_e4m3 scalar, bool *mask, const int size);
+    // template void launch_lessscalar<__nv_fp8_e5m2, bool>(const __nv_fp8_e5m2 *A, const __nv_fp8_e5m2 scalar, bool *mask, const int size);
+
     // greater
     template <typename T, typename MaskT>
     __global__ void greater_kernel(const T *A, const T *B, MaskT *mask, const int size)
     {
@@ -410,6 +517,8 @@ namespace deepx::tensorfunc
     template void launch_greater(const int32_t *A, const int32_t *B, bool *mask, const int size);
     template void launch_greater(const int16_t *A, const int16_t *B, bool *mask, const int size);
     template void launch_greater(const int8_t *A, const int8_t *B, bool *mask, const int size);
+    // template void launch_greater<__nv_fp8_e4m3, bool>(const __nv_fp8_e4m3 *A, const __nv_fp8_e4m3 *B, bool *mask, const int size);
+    // template void launch_greater<__nv_fp8_e5m2, bool>(const __nv_fp8_e5m2 *A, const __nv_fp8_e5m2 *B, bool *mask, const int size);
 
     // greaterscalar
     template <typename T, typename MaskT>
@@ -438,6 +547,8 @@ namespace deepx::tensorfunc
     template void launch_greaterscalar(const int32_t *A, const int32_t scalar, bool *mask, const int size);
     template void launch_greaterscalar(const int16_t *A, const int16_t scalar, bool *mask, const int size);
     template void launch_greaterscalar(const int8_t *A, const int8_t scalar, bool *mask, const int size);
+    // template void launch_greaterscalar<__nv_fp8_e4m3, bool>(const __nv_fp8_e4m3 *A, const __nv_fp8_e4m3 scalar, bool *mask, const int size);
+    // template void launch_greaterscalar<__nv_fp8_e5m2, bool>(const __nv_fp8_e5m2 *A, const __nv_fp8_e5m2 scalar, bool *mask, const int size);
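
Review note: the ordering comparisons are likewise commented out for fp8. Since __half has direct device comparison intrinsics, these overloads would not need the float detour; e.g. a less_kernel sketch (not in this diff; __hlt is the CUDA half less-than intrinsic from cuda_fp16.h):

```cpp
template <typename T, typename MaskT, std::enable_if_t<deepx::is_fp8_v<T>, int> = 0>
__global__ void less_kernel(const T *A, const T *B, MaskT *mask, const int size)
{
    int stride = blockDim.x * gridDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
    {
        // __hlt compares two __half values on the device and returns bool
        mask[idx] = __hlt(deepx::to_half<T>::convert(A[idx]),
                          deepx::to_half<T>::convert(B[idx]));
    }
}
```
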
 
     // switch
     template <typename T, typename CasesT>
@@ -476,7 +587,9 @@ namespace deepx::tensorfunc
     template void launch_switch(const int16_t **tensorsdata, const int numTensors, const int32_t *cases, int16_t *C, const int size);
     template void launch_switch(const int8_t **tensorsdata, const int numTensors, const int32_t *cases, int8_t *C, const int size);
     template void launch_switch(const bool **tensorsdata, const int numTensors, const int32_t *cases, bool *C, const int size);
-
+    // template void launch_switch<__nv_fp8_e4m3, int32_t>(const __nv_fp8_e4m3 **tensorsdata, const int numTensors, const int32_t *cases, __nv_fp8_e4m3 *C, const int size);
+    // template void launch_switch<__nv_fp8_e5m2, int32_t>(const __nv_fp8_e5m2 **tensorsdata, const int numTensors, const int32_t *cases, __nv_fp8_e5m2 *C, const int size);
+
     template void launch_switch(const double **tensorsdata, const int numTensors, const bool *cases, double *C, const int size);
     template void launch_switch(const float **tensorsdata, const int numTensors, const bool *cases, float *C, const int size);
     template void launch_switch(const nv_bfloat16 **tensorsdata, const int numTensors, const bool *cases, nv_bfloat16 *C, const int size);
@@ -486,6 +599,7 @@ namespace deepx::tensorfunc
     template void launch_switch(const int16_t **tensorsdata, const int numTensors, const bool *cases, int16_t *C, const int size);
     template void launch_switch(const int8_t **tensorsdata, const int numTensors, const bool *cases, int8_t *C, const int size);
     template void launch_switch(const bool **tensorsdata, const int numTensors, const bool *cases, bool *C, const int size);
-
+    // template void launch_switch<__nv_fp8_e4m3, bool>(const __nv_fp8_e4m3 **tensorsdata, const int numTensors, const bool *cases, __nv_fp8_e4m3 *C, const int size);
+    // template void launch_switch<__nv_fp8_e5m2, bool>(const __nv_fp8_e5m2 **tensorsdata, const int numTensors, const bool *cases, __nv_fp8_e5m2 *C, const int size);
 }
 #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_CU
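
Review note: a quick host-side check sketch for the instantiated fp8 launch_equal path. The helper below is assumed test scaffolding, not part of this diff, and it assumes launch_equal takes the epsilon branch for epsilon > 0:

```cpp
#include <algorithm>
#include <vector>
#include <cuda_fp8.h>
#include <cuda_runtime.h>

bool fp8_equal_self_check(int n, float epsilon = 1e-3f)
{
    __nv_fp8_e5m2 *A;
    bool *mask;
    cudaMalloc(&A, n * sizeof(__nv_fp8_e5m2));
    cudaMemset(A, 0, n * sizeof(__nv_fp8_e5m2)); // all-zero bytes are a valid fp8 pattern (+0.0)
    cudaMalloc(&mask, n * sizeof(bool));

    // compare A with itself: |a - a| = 0 < epsilon, so every mask entry should be set
    deepx::tensorfunc::launch_equal<__nv_fp8_e5m2, bool>(A, A, epsilon, mask, n);

    std::vector<char> h(n);
    cudaMemcpy(h.data(), mask, n * sizeof(bool), cudaMemcpyDeviceToHost);
    cudaFree(A);
    cudaFree(mask);
    return std::all_of(h.begin(), h.end(), [](char m) { return m != 0; });
}
```
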