From 16cb9fd168d56096c3d06b8e930a167e03921008 Mon Sep 17 00:00:00 2001
From: "Zhe,Wang" <zhe1.wang@intel.com>
Date: Wed, 26 Jul 2023 10:44:03 +0800
Subject: [PATCH 1/5] todo: add cpu-end torch custom fp4 quant ref

---
 .vscode/launch.json   | 16 ++++++++++++++++
 Makefile              |  3 ++-
 csrc/ops.cuh          |  1 +
 tests/fp4_ref.py      | 18 ++++++++++++++++++
 tests/test_modules.py | 20 +++++++++++---------
 5 files changed, 48 insertions(+), 10 deletions(-)
 create mode 100644 .vscode/launch.json
 create mode 100644 tests/fp4_ref.py

diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 000000000..2b2502c69
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "justMyCode": false
+        }
+    ]
+}
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 7ccbcb191..8f786442f 100644
--- a/Makefile
+++ b/Makefile
@@ -24,7 +24,8 @@ BUILD_DIR:= $(ROOT_DIR)/build
 FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu
 FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c
 
-INCLUDE :=  -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include
+# INCLUDE :=  -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include
+INCLUDE :=  -I /usr/local/cuda-11.3/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include
 LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib
 
 # NVIDIA NVCC compilation flags
diff --git a/csrc/ops.cuh b/csrc/ops.cuh
index f37b3b3af..0b6b0418d 100644
--- a/csrc/ops.cuh
+++ b/csrc/ops.cuh
@@ -14,6 +14,7 @@
 
 #include <cuda_runtime_api.h>
 #include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include <cublas_v2.h>
 #include <cublasLt.h>
 #include <cusparse.h>
diff --git a/tests/fp4_ref.py b/tests/fp4_ref.py
new file mode 100644
index 000000000..69db98ce8
--- /dev/null
+++ b/tests/fp4_ref.py
@@ -0,0 +1,18 @@
+import torch
+from torch import Tensor
+def ref_quantizeblockwise_fp4(A: Tensor,absmax: Tensor,out: Tensor,blocksize=64,compress_statics=False,quant_type="fp4"):
+    n = A.numel()
+    input_shape = A.shape
+
+    if absmax is None:
+        blocks = n // blocksize
+        blocks += 1 if n % blocksize > 0 else 0
+        absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
+
+
+    if out is None:
+        out = torch.zeros(((n+1)//2, 1), dtype=torch.uint8, device=A.device)
+
+    assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+
+    
diff --git a/tests/test_modules.py b/tests/test_modules.py
index 7d2d03498..ab0226484 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -317,16 +317,17 @@ def forward(self, x):
 
 @pytest.mark.parametrize("threshold", values, ids=names)
 def test_linear8bitlt_inference(threshold):
-    l1 = bnb.nn.Linear8bitLt(32, 64, threshold=threshold).cuda().half()
-    assert l1.weight.device.type == "cuda"
-    assert l1.weight.dtype == torch.float16
+    # l1 = bnb.nn.Linear8bitLt(32, 64, threshold=threshold).cuda().half()
+    test=bnb.nn.LinearFP4(32,64).cuda()
+    # assert l1.weight.device.type == "cuda"
+    # assert l1.weight.dtype == torch.float16
 
-    l1.eval()
-    for i in range(100):
-        b1 = torch.randn(16, 8, 32, device="cuda").half()
-        o1 = l1(b1)
-        if i == 1:
-            assert l1.state.CxB is not None
+    # l1.eval()
+    # for i in range(100):
+    #     b1 = torch.randn(16, 8, 32, device="cuda").half()
+    #     o1 = l1(b1)
+    #     if i == 1:
+    #         assert l1.state.CxB is not None
 
 
 def test_linear8bitlt_accumulated_gradient():
@@ -641,3 +642,4 @@ def test_4bit_warnings():
 
 
 
+test_linear8bitlt_inference(0.1)
\ No newline at end of file

From b85b9867c6cbe170976b8a71a0c8b3cbcaa8f540 Mon Sep 17 00:00:00 2001
From: "Zhe,Wang" <zhe1.wang@intel.com>
Date: Wed, 26 Jul 2023 12:47:36 +0800
Subject: [PATCH 2/5] add cpu-end fp4 quant ref interface.

---
 tests/custom_op/CMakeLists.txt             | 14 +++++++
 tests/custom_op/pyt_all_reduce_cpu_test.py |  6 +++
 tests/custom_op/pyt_all_reduce_gpu_test.py |  6 +++
 tests/custom_op/pyt_all_reduce_kernel.cu   | 49 ++++++++++++++++++++++
 tests/custom_op/pyt_all_reduce_kernel.hh   |  9 ++++
 tests/custom_op/pyt_all_reduce_op.cpp      | 18 ++++++++
 tests/custom_op/run_build.sh               |  9 ++++
 7 files changed, 111 insertions(+)
 create mode 100644 tests/custom_op/CMakeLists.txt
 create mode 100644 tests/custom_op/pyt_all_reduce_cpu_test.py
 create mode 100644 tests/custom_op/pyt_all_reduce_gpu_test.py
 create mode 100644 tests/custom_op/pyt_all_reduce_kernel.cu
 create mode 100644 tests/custom_op/pyt_all_reduce_kernel.hh
 create mode 100644 tests/custom_op/pyt_all_reduce_op.cpp
 create mode 100755 tests/custom_op/run_build.sh

diff --git a/tests/custom_op/CMakeLists.txt b/tests/custom_op/CMakeLists.txt
new file mode 100644
index 000000000..a4db9f82d
--- /dev/null
+++ b/tests/custom_op/CMakeLists.txt
@@ -0,0 +1,14 @@
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+project(custom_allreduce_op LANGUAGES CXX CUDA)
+
+find_package(Torch REQUIRED)
+
+# Define our library target
+add_library(custom_allreduce_op SHARED pyt_all_reduce_op.cpp pyt_all_reduce_kernel.cu)
+# Enable C++14
+target_compile_features(custom_allreduce_op PRIVATE cxx_std_14)
+# Link against LibTorch
+target_link_libraries(custom_allreduce_op "${TORCH_LIBRARIES}")
+
+set_property(TARGET torch_cuda PROPERTY INTERFACE_COMPILE_OPTIONS "")
+set_property(TARGET torch_cpu PROPERTY INTERFACE_COMPILE_OPTIONS "")
diff --git a/tests/custom_op/pyt_all_reduce_cpu_test.py b/tests/custom_op/pyt_all_reduce_cpu_test.py
new file mode 100644
index 000000000..043f8e317
--- /dev/null
+++ b/tests/custom_op/pyt_all_reduce_cpu_test.py
@@ -0,0 +1,6 @@
+import torch
+torch.ops.load_library("build/libcustom_allreduce_op.so")
+A = torch.ones(1024, dtype=torch.int)
+b = torch.ops.my_ops.custom_allreduce(A)
+print(b)
+
diff --git a/tests/custom_op/pyt_all_reduce_gpu_test.py b/tests/custom_op/pyt_all_reduce_gpu_test.py
new file mode 100644
index 000000000..d61f73175
--- /dev/null
+++ b/tests/custom_op/pyt_all_reduce_gpu_test.py
@@ -0,0 +1,6 @@
+import torch
+torch.ops.load_library("build/libcustom_allreduce_op.so")
+A = torch.ones(1024, dtype=torch.float32, device='cuda')
+b = torch.ops.my_ops.custom_allreduce(A.half())
+print(b[0]) #.to('cpu'))
+
diff --git a/tests/custom_op/pyt_all_reduce_kernel.cu b/tests/custom_op/pyt_all_reduce_kernel.cu
new file mode 100644
index 000000000..37f17642c
--- /dev/null
+++ b/tests/custom_op/pyt_all_reduce_kernel.cu
@@ -0,0 +1,49 @@
+#include <torch/script.h>
+
+#include "pyt_all_reduce_kernel.hh"
+#define BLOCKX_DIM 256
+
+template <typename scalar_t>
+void cpu_all_reduce(float* sum, scalar_t* data, int n) {
+  scalar_t temp_sum = 0;
+  for (int i = 0; i < n; ++i) {
+    temp_sum += data[i];
+  }
+  *sum = temp_sum;
+}
+
+template <typename scalar_t>
+__global__ void gpu_all_reduce(float* sum, scalar_t* data, int n) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  scalar_t temp = 0;
+  for (int i = idx; i < n; i += stride) {
+    temp += data[i];
+  }
+
+  atomicAdd(sum, temp);
+}
+
+torch::Tensor all_reduce_launcher(torch::Tensor input) {
+  torch::Device device(torch::kCUDA, 0);
+  torch::Tensor output = torch::zeros(1, torch::kFloat);
+  if (input.device() == device) {
+    output = output.to(device);
+    dim3 blockSize(BLOCKX_DIM);
+    dim3 gridSize((input.size(0) + BLOCKX_DIM - 1) / BLOCKX_DIM);
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+        input.type(), "gpu_all_reduce", ([&] {
+          gpu_all_reduce<scalar_t><<<gridSize, blockSize, 0, stream>>>(
+              output.data_ptr<float>(), input.data_ptr<scalar_t>(),
+              input.size(0));
+        }));
+  } else {
+    cpu_all_reduce<int>(output.data_ptr<float>(), input.data_ptr<int>(),
+                        input.size(0));
+  }
+  return output;
+}
+
+void fp4_quantize_launcher(const torch::Tensor& A, const torch::Tensor& absmax,
+                           torch::Tensor& out, int64_t blocksize, int64_t n) {}
\ No newline at end of file
diff --git a/tests/custom_op/pyt_all_reduce_kernel.hh b/tests/custom_op/pyt_all_reduce_kernel.hh
new file mode 100644
index 000000000..d0b26fc1d
--- /dev/null
+++ b/tests/custom_op/pyt_all_reduce_kernel.hh
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/script.h>
+
+torch::Tensor all_reduce_launcher(torch::Tensor input);
+
+void fp4_quantize_launcher(const torch::Tensor& A, const torch::Tensor& absmax,
+                           torch::Tensor& out, int64_t blocksize, int64_t n);
diff --git a/tests/custom_op/pyt_all_reduce_op.cpp b/tests/custom_op/pyt_all_reduce_op.cpp
new file mode 100644
index 000000000..17e6c83ca
--- /dev/null
+++ b/tests/custom_op/pyt_all_reduce_op.cpp
@@ -0,0 +1,18 @@
+#include <torch/script.h>
+#include <torch/torch.h>
+
+#include "pyt_all_reduce_kernel.hh"
+
+static torch::Tensor custom_allreduce(torch::Tensor input) {
+  return all_reduce_launcher(input);
+}
+
+static void ref_fp4_quantize(const torch::Tensor& A,
+                             const torch::Tensor& absmax, torch::Tensor& out,
+                             int64_t blocksize, int64_t n) {}
+
+// static auto registry = torch::RegisterOperators("myop::skbmm", &skbmm);
+TORCH_LIBRARY(my_ops, m) {
+  m.def("custom_allreduce", &custom_allreduce);
+  m.def("ref_fp4_quantize", &ref_fp4_quantize);
+}
diff --git a/tests/custom_op/run_build.sh b/tests/custom_op/run_build.sh
new file mode 100755
index 000000000..35f068b6f
--- /dev/null
+++ b/tests/custom_op/run_build.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+rm -rf build;
+mkdir build;
+cd build
+
+cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch; print(torch.__path__[0])')" ..
+
+make -j$(nproc)

From 10b35398dac1c7d6236b4050f01ea578f85e4f55 Mon Sep 17 00:00:00 2001
From: "Zhe,Wang" <zhe1.wang@intel.com>
Date: Wed, 26 Jul 2023 14:05:17 +0800
Subject: [PATCH 3/5] cpu fp4_quantize pass ut

---
 bitsandbytes/functional.py               |  3 +-
 bitsandbytes/nn/modules.py               |  7 +++
 tests/custom_op/pyt_all_reduce_kernel.cu | 69 +++++++++++++++++++++++-
 tests/custom_op/pyt_all_reduce_kernel.hh |  2 +-
 tests/custom_op/pyt_all_reduce_op.cpp    |  7 +--
 5 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 837f6bf1c..871915172 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -838,7 +838,8 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
     else:
         raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
     post_call(A.device)
-
+    print(absmax)
+    print(out)
     datatype = get_4bit_type(quant_type, device=A.device)
 
     if compress_statistics:
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 3d34bb45f..8551a6def 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -154,9 +154,16 @@ def __new__(cls, data=None, requires_grad=True, quant_state=None, blocksize=64,
 
     def cuda(self, device):
         w = self.data.contiguous().half().cuda(device)
+        rw=self.data.contiguous()
         w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type)
         self.data = w_4bit
         self.quant_state = quant_state
+        absmax=torch.zeros(32,dtype=torch.float)
+        out = torch.zeros(1024,1, dtype=torch.uint8)
+        torch.ops.load_library("/home/zhe/bitsandbytes/tests/custom_op/build/libcustom_allreduce_op.so")
+        torch.ops.my_ops.ref_fp4_quantize(rw,absmax,out,64,2048)
+        print(absmax)
+        print(out)
 
         return self
 
diff --git a/tests/custom_op/pyt_all_reduce_kernel.cu b/tests/custom_op/pyt_all_reduce_kernel.cu
index 37f17642c..6c6e6e608 100644
--- a/tests/custom_op/pyt_all_reduce_kernel.cu
+++ b/tests/custom_op/pyt_all_reduce_kernel.cu
@@ -45,5 +45,70 @@ torch::Tensor all_reduce_launcher(torch::Tensor input) {
   return output;
 }
 
-void fp4_quantize_launcher(const torch::Tensor& A, const torch::Tensor& absmax,
-                           torch::Tensor& out, int64_t blocksize, int64_t n) {}
\ No newline at end of file
+unsigned char dQuantizeFP4(float x) {
+  // FP4 with bias of 3
+  // first bit is a sign
+  // subnormals
+  // 0b000 = 0
+  // 0b001 = 0.0625
+  // 0b110 = 2
+  // 0b111 = 3
+  // 0b100 = 4
+  // 0b101 = 6
+  // 0b010 = 8
+  // 0b011 = 12
+
+  // we do a binary search
+  // the pivots are divided by 12 (the FP4 absmax)
+  // since we assum input data is in [-1.0, 1.0]
+
+  // !be careful here, its easy to make a mistake
+  // that is difficult to noice if you add an extra
+  // zero somewhere!
+
+  int sign = x < 0 ? 0b1000 : 0b0000;
+  x = fabsf(x);
+  if (x > 0.29166667f)
+    if (x > 0.583333f)
+      if (x > 0.8333333f)
+        return 0b0011 + sign;
+      else
+        return 0b0010 + sign;
+    else if (x > 0.4166667f)
+      return 0b101 + sign;
+    else
+      return 0b100 + sign;
+  else if (x > 0.0859375f)
+    if (x > 0.20833333f)
+      return 0b0111 + sign;
+    else
+      return 0b0110 + sign;
+  else if (x > 0.00260417f)
+    return 0b0001 + sign;
+  else
+    return 0b0000 + sign;
+}
+
+void fp4_quantize_launcher(const torch::Tensor& A, torch::Tensor& absmax,
+                           torch::Tensor& out, int64_t blocksize, int64_t n) {
+  auto blocks = absmax.sizes()[0];
+  auto src = A.data_ptr<float>();
+  auto absmax_ptr = absmax.data_ptr<float>();
+  auto out_ptr = out.data_ptr<unsigned char>();
+  for (int b = 0; b < blocks; b++) {
+    float max = -99999999999999.f;
+    size_t offset = b * blocksize;
+    for (int i = 0; i < blocksize; i++) {
+      if (offset + i >= n) break;
+      max = std::abs(src[offset + i]) > max ? std::abs(src[offset + i]) : max;
+    }
+    absmax_ptr[b] = max;
+    for (int i = 0; i < blocksize / 2; i++) {
+      unsigned char packed_4bit = 0;
+      if (offset + i * 2 >= n) break;
+      packed_4bit |= dQuantizeFP4(src[offset + 2 * i] * (1.f / max)) << 4;
+      packed_4bit |= dQuantizeFP4(src[offset + 2 * i + 1] * (1.f / max));
+      out_ptr[offset / 2 + i] = packed_4bit;
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/custom_op/pyt_all_reduce_kernel.hh b/tests/custom_op/pyt_all_reduce_kernel.hh
index d0b26fc1d..78d4e8c9b 100644
--- a/tests/custom_op/pyt_all_reduce_kernel.hh
+++ b/tests/custom_op/pyt_all_reduce_kernel.hh
@@ -5,5 +5,5 @@
 
 torch::Tensor all_reduce_launcher(torch::Tensor input);
 
-void fp4_quantize_launcher(const torch::Tensor& A, const torch::Tensor& absmax,
+void fp4_quantize_launcher(const torch::Tensor& A, torch::Tensor& absmax,
                            torch::Tensor& out, int64_t blocksize, int64_t n);
diff --git a/tests/custom_op/pyt_all_reduce_op.cpp b/tests/custom_op/pyt_all_reduce_op.cpp
index 17e6c83ca..73da05678 100644
--- a/tests/custom_op/pyt_all_reduce_op.cpp
+++ b/tests/custom_op/pyt_all_reduce_op.cpp
@@ -7,9 +7,10 @@ static torch::Tensor custom_allreduce(torch::Tensor input) {
   return all_reduce_launcher(input);
 }
 
-static void ref_fp4_quantize(const torch::Tensor& A,
-                             const torch::Tensor& absmax, torch::Tensor& out,
-                             int64_t blocksize, int64_t n) {}
+static void ref_fp4_quantize(const torch::Tensor& A, torch::Tensor& absmax,
+                             torch::Tensor& out, int64_t blocksize, int64_t n) {
+  fp4_quantize_launcher(A, absmax, out, blocksize, n);
+}
 
 // static auto registry = torch::RegisterOperators("myop::skbmm", &skbmm);
 TORCH_LIBRARY(my_ops, m) {

From b905abed03e0ed72c67a2d3adf35c4795f932c9a Mon Sep 17 00:00:00 2001
From: "Zhe,Wang" <zhe1.wang@intel.com>
Date: Wed, 26 Jul 2023 14:36:13 +0800
Subject: [PATCH 4/5] fp4 quantize ref done.

---
 bitsandbytes/functional.py |  4 ++--
 bitsandbytes/nn/modules.py |  4 ++--
 tests/fp4_ref.py           |  6 +++++-
 tests/test_functional.py   | 10 ++++++++--
 4 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 871915172..2802a1183 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -838,8 +838,8 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
     else:
         raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
     post_call(A.device)
-    print(absmax)
-    print(out)
+    # print(absmax)
+    # print(out)
     datatype = get_4bit_type(quant_type, device=A.device)
 
     if compress_statistics:
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 8551a6def..c9526873b 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -162,8 +162,8 @@ def cuda(self, device):
         out = torch.zeros(1024,1, dtype=torch.uint8)
         torch.ops.load_library("/home/zhe/bitsandbytes/tests/custom_op/build/libcustom_allreduce_op.so")
         torch.ops.my_ops.ref_fp4_quantize(rw,absmax,out,64,2048)
-        print(absmax)
-        print(out)
+        # print(absmax)
+        # print(out)
 
         return self
 
diff --git a/tests/fp4_ref.py b/tests/fp4_ref.py
index 69db98ce8..baa7fffe5 100644
--- a/tests/fp4_ref.py
+++ b/tests/fp4_ref.py
@@ -1,6 +1,6 @@
 import torch
 from torch import Tensor
-def ref_quantizeblockwise_fp4(A: Tensor,absmax: Tensor,out: Tensor,blocksize=64,compress_statics=False,quant_type="fp4"):
+def ref_quantizeblockwise_fp4(A: Tensor,absmax: Tensor=None,out: Tensor=None,blocksize=64,compress_statics=False,quant_type="fp4"):
     n = A.numel()
     input_shape = A.shape
 
@@ -15,4 +15,8 @@ def ref_quantizeblockwise_fp4(A: Tensor,absmax: Tensor,out: Tensor,blocksize=64,
 
     assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
 
+    torch.ops.load_library("/home/zhe/bitsandbytes/tests/custom_op/build/libcustom_allreduce_op.so")
+    torch.ops.my_ops.ref_fp4_quantize(A,absmax,out,64,n)
+
+    return absmax,out
     
diff --git a/tests/test_functional.py b/tests/test_functional.py
index d7212b047..3be1794c7 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -12,6 +12,8 @@
 from bitsandbytes import functional as F
 from scipy.stats import norm
 
+import fp4_ref
+
 torch.set_printoptions(
     precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000
 )
@@ -2257,10 +2259,13 @@ def test_fp4_quant(dtype):
             result = sign*exp*frac
         code[idx] = result
 
-    A1 = torch.randn(1024, 1024, device='cuda', dtype=dtype)
+    A1 = torch.randn(1023, 1023, device='cuda', dtype=dtype)
+    A11=A1.to("cpu")
     qa, SA = F.quantize_fp4(A1, blocksize=64)
+    qaa,SAA=fp4_ref.ref_quantizeblockwise_fp4(A11,blocksize=64)
+    np.allclose(qa.cpu(),qaa,1.e-1)
+    np.allclose(SA[0].cpu(),SAA,1)
     A2 = F.dequantize_fp4(qa, SA)
-
     err = (A1 - A2).abs().float()
     relerr = (err/(A1.abs().float()+1e-8)).mean()
     idx = err > 1.0
@@ -2553,3 +2558,4 @@ def test_gemv_eye_4bit(storage_type, dtype, double_quant):
         #torch.testing.assert_close(A, C2, rtol=1e-5, atol=0.080)
 
 
+test_fp4_quant(torch.float32)
\ No newline at end of file

From 17ff13d33a84b2bfd9d561c4b029d00c18f756f6 Mon Sep 17 00:00:00 2001
From: "Zhe,Wang" <zhe1.wang@intel.com>
Date: Wed, 26 Jul 2023 16:34:56 +0800
Subject: [PATCH 5/5] fp4 dequantize ref done.

---
 tests/custom_op/pyt_all_reduce_kernel.cu | 43 ++++++++++++++++++++++++
 tests/custom_op/pyt_all_reduce_kernel.hh |  3 ++
 tests/custom_op/pyt_all_reduce_op.cpp    |  7 ++++
 tests/fp4_ref.py                         | 14 ++++++--
 tests/test_functional.py                 |  7 ++--
 5 files changed, 70 insertions(+), 4 deletions(-)

diff --git a/tests/custom_op/pyt_all_reduce_kernel.cu b/tests/custom_op/pyt_all_reduce_kernel.cu
index 6c6e6e608..0cd992907 100644
--- a/tests/custom_op/pyt_all_reduce_kernel.cu
+++ b/tests/custom_op/pyt_all_reduce_kernel.cu
@@ -45,6 +45,29 @@ torch::Tensor all_reduce_launcher(torch::Tensor input) {
   return output;
 }
 
+float dDequantizeFP4Tree(unsigned char val, float absmax) {
+  float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
+  if ((val & 0b0100) == 4)                   // 0
+    if ((val & 0b0010) == 2)                 // 01
+      if ((val & 0b0001) == 1)               // 111
+        return 0.25000000f * absmax * sign;  // 1111
+      else
+        return 0.16666667f * absmax * sign;  // 1110
+    else if ((val & 0b0001) == 1)            // 110
+      return 0.50000000f * absmax * sign;    // 1101
+    else
+      return 0.33333333f * absmax * sign;  // 1100
+  else if ((val & 0b0010) == 2)            // 10
+    if ((val & 0b0001) == 1)               // 101
+      return 1.00000000f * absmax * sign;  // 1011
+    else
+      return 0.66666667f * absmax * sign;     // 1010
+  else if ((val & 0b0001) == 1)               // 100
+    return 5.208333333e-03f * absmax * sign;  // 1001
+  else
+    return 0.00000000f * absmax * sign;  // 1000
+}
+
 unsigned char dQuantizeFP4(float x) {
   // FP4 with bias of 3
   // first bit is a sign
@@ -111,4 +134,24 @@ void fp4_quantize_launcher(const torch::Tensor& A, torch::Tensor& absmax,
       out_ptr[offset / 2 + i] = packed_4bit;
     }
   }
+}
+
+void fp4_dequantize_launcher(const torch::Tensor& A, torch::Tensor& absmax,
+                             torch::Tensor& out, int64_t blocksize, int64_t n) {
+  auto blocks = absmax.sizes()[0];
+  auto src = A.data_ptr<unsigned char>();
+  auto absmax_ptr = absmax.data_ptr<float>();
+  auto out_ptr = out.data_ptr<float>();
+  for (int b = 0; b < blocks; b++) {
+    size_t offset = b * blocksize;
+    auto max = absmax_ptr[b];
+    for (int i = 0; i < blocksize / 2; i++) {
+      unsigned char packed_4bit = 0;
+      if (offset + i * 2 >= n) break;
+      out_ptr[offset + 2 * i] =
+          dDequantizeFP4Tree(src[offset / 2 + i] >> 4, max);
+      out_ptr[offset + 2 * i + 1] =
+          dDequantizeFP4Tree(src[offset / 2 + i] & 0x0f, max);
+    }
+  }
 }
\ No newline at end of file
diff --git a/tests/custom_op/pyt_all_reduce_kernel.hh b/tests/custom_op/pyt_all_reduce_kernel.hh
index 78d4e8c9b..dd60102be 100644
--- a/tests/custom_op/pyt_all_reduce_kernel.hh
+++ b/tests/custom_op/pyt_all_reduce_kernel.hh
@@ -7,3 +7,6 @@ torch::Tensor all_reduce_launcher(torch::Tensor input);
 
 void fp4_quantize_launcher(const torch::Tensor& A, torch::Tensor& absmax,
                            torch::Tensor& out, int64_t blocksize, int64_t n);
+
+void fp4_dequantize_launcher(const torch::Tensor& A, torch::Tensor& absmax,
+                             torch::Tensor& out, int64_t blocksize, int64_t n);
\ No newline at end of file
diff --git a/tests/custom_op/pyt_all_reduce_op.cpp b/tests/custom_op/pyt_all_reduce_op.cpp
index 73da05678..d8dfc776c 100644
--- a/tests/custom_op/pyt_all_reduce_op.cpp
+++ b/tests/custom_op/pyt_all_reduce_op.cpp
@@ -12,8 +12,15 @@ static void ref_fp4_quantize(const torch::Tensor& A, torch::Tensor& absmax,
   fp4_quantize_launcher(A, absmax, out, blocksize, n);
 }
 
+static void ref_fp4_dequantize(const torch::Tensor& A, torch::Tensor& absmax,
+                               torch::Tensor& out, int64_t blocksize,
+                               int64_t n) {
+  fp4_dequantize_launcher(A, absmax, out, blocksize, n);
+}
+
 // static auto registry = torch::RegisterOperators("myop::skbmm", &skbmm);
 TORCH_LIBRARY(my_ops, m) {
   m.def("custom_allreduce", &custom_allreduce);
   m.def("ref_fp4_quantize", &ref_fp4_quantize);
+  m.def("ref_fp4_dequantize", &ref_fp4_dequantize);
 }
diff --git a/tests/fp4_ref.py b/tests/fp4_ref.py
index baa7fffe5..d0ab2dcf5 100644
--- a/tests/fp4_ref.py
+++ b/tests/fp4_ref.py
@@ -1,5 +1,7 @@
 import torch
 from torch import Tensor
+# TODO: support double-quant for absmax
+torch.ops.load_library("/home/zhe/bitsandbytes/tests/custom_op/build/libcustom_allreduce_op.so")
 def ref_quantizeblockwise_fp4(A: Tensor,absmax: Tensor=None,out: Tensor=None,blocksize=64,compress_statics=False,quant_type="fp4"):
     n = A.numel()
     input_shape = A.shape
@@ -15,8 +17,16 @@ def ref_quantizeblockwise_fp4(A: Tensor,absmax: Tensor=None,out: Tensor=None,blo
 
     assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
 
-    torch.ops.load_library("/home/zhe/bitsandbytes/tests/custom_op/build/libcustom_allreduce_op.so")
     torch.ops.my_ops.ref_fp4_quantize(A,absmax,out,64,n)
 
     return absmax,out
-    
+
+# TODO: support absmax-dequant 
+def ref_dequantizeblockwise_fp4(A: Tensor,out: Tensor,absmax: Tensor=None,blocksize=64,compress_statics=False,quant_type="fp4"):
+    assert(out != None)
+    shape = out.shape
+    dtype = out.dtype
+    n = out.numel()
+    torch.ops.my_ops.ref_fp4_dequantize(A,absmax,out,64,n)
+    return out
+    
\ No newline at end of file
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 3be1794c7..0a574d120 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -2263,9 +2263,12 @@ def test_fp4_quant(dtype):
     A11=A1.to("cpu")
     qa, SA = F.quantize_fp4(A1, blocksize=64)
     qaa,SAA=fp4_ref.ref_quantizeblockwise_fp4(A11,blocksize=64)
-    np.allclose(qa.cpu(),qaa,1.e-1)
-    np.allclose(SA[0].cpu(),SAA,1)
+    # np.allclose(qa.cpu(),qaa,1.e-1)
+    # np.allclose(SA[0].cpu(),SAA,1)
     A2 = F.dequantize_fp4(qa, SA)
+    out=torch.zeros(1023,1023,dtype=dtype)
+    A22=fp4_ref.ref_dequantizeblockwise_fp4(SAA,out,qaa)
+    np.allclose(A2.cpu(),A22,0)
     err = (A1 - A2).abs().float()
     relerr = (err/(A1.abs().float()+1e-8)).mean()
     idx = err > 1.0