diff --git a/benchmark.py b/benchmark.py
index ab4a11c..99debed 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -23,21 +23,21 @@
 #
 # model download: https://huggingface.co/BlinkDL/rwkv7-g1
 #
-args.MODEL_NAME = "/mnt/e/RWKV-Runner/models/rwkv7-g1a-0.1b-20250728-ctx4096"
-args.n_layer = 12
-args.n_embd = 768
-# args.MODEL_NAME = "/mnt/e/RWKV-Runner/models/rwkv7-g1-0.4b-20250324-ctx4096"
+# args.MODEL_NAME = "./rwkv7-g1a-0.1b-20250728-ctx4096"
+# args.n_layer = 12
+# args.n_embd = 768
+# args.MODEL_NAME = "./rwkv7-g1-0.4b-20250324-ctx4096"
 # args.n_layer = 24
 # args.n_embd = 1024
-# args.MODEL_NAME = "/mnt/e/RWKV-Runner/models/rwkv7-g1-1.5b-20250429-ctx4096"
+# args.MODEL_NAME = "./rwkv7-g1-1.5b-20250429-ctx4096"
 # args.n_layer = 24
 # args.n_embd = 2048
-# args.MODEL_NAME = "/mnt/e/RWKV-Runner/models/rwkv7-g1-2.9b-20250519-ctx4096"
+# args.MODEL_NAME = "./rwkv7-g1-2.9b-20250519-ctx4096"
 # args.n_layer = 32
 # args.n_embd = 2560
-# args.MODEL_NAME = "/mnt/e/RWKV-Runner/models/rwkv7-g0a-7.2b-20250829-ctx4096"
-# args.n_layer = 32
-# args.n_embd = 4096
+args.MODEL_NAME = "./rwkv7-g0a-7.2b-20250829-ctx4096"
+args.n_layer = 32
+args.n_embd = 4096
 
 print(f'\nUsing CUDA fp16. Loading {args.MODEL_NAME} ...\n')
 
@@ -65,7 +65,7 @@ def xprint(s):
 
 prompt = "The Eiffel tower is in the city of"
 print(prompt)
-
+torch.compiler.cudagraph_mark_step_begin()
 init_out, init_state = model.forward(tokenizer.encode(prompt), None)
 probs = F.softmax(init_out.float(), dim=-1) # compute softmax in float (more accurate)
 _, indices = torch.topk(probs, 5) # print top-5 possibilities
@@ -89,11 +89,15 @@ def xprint(s):
 
 all_tokens = []
 out_last = 0
+torch.compiler.cudagraph_mark_step_begin()
 init_out, init_state = model.forward(tokenizer.encode(prompt), None)
 out, state = init_out.clone(), copy.deepcopy(init_state)
 
-min_time = 1e10
-min_time_all = 1e10
+# min_time = 1e10
+# min_time_all = 1e10
+
+all_times = []
+
 t000 = time.perf_counter()
 for i in range(LENGTH_PER_TRIAL):
     t00 = time.perf_counter()
@@ -109,11 +113,16 @@ def xprint(s):
 
     torch.cuda.synchronize()
     t0 = time.perf_counter()
+    torch.compiler.cudagraph_mark_step_begin()
     out, state = model.forward(token, state)
+    out, state = out.clone(), copy.deepcopy(state)
     torch.cuda.synchronize()
     t1 = time.perf_counter()
-    min_time = min(min_time, t1 - t0)
-    min_time_all = min(min_time_all, t1 - t00)
+    # Record (forward-only, whole-iteration) latency for this token.
+    all_times.append((t1 - t0, t1 - t00))
+
+# Median of each column: [0] forward call only, [1] whole loop iteration.
+min_time, min_time_all = np.median(all_times, axis=0)
 
 print(f'\n\nToken/s = {round(1/min_time,2)} (forward), {round(1/min_time_all,2)} (full) || Bandwidth = {round(active_GB/min_time,2)} GB/s || {round(time.perf_counter()-t000,3)}s')
 
diff --git a/reference/rwkv7.py b/reference/rwkv7.py
index a1edfc1..3f4cbae 100644
--- a/reference/rwkv7.py
+++ b/reference/rwkv7.py
@@ -19,13 +19,13 @@
 import torch.nn as nn
 from torch.nn import functional as F
 
-MyModule = torch.jit.ScriptModule
-MyFunction = torch.jit.script_method
-MyStatic = torch.jit.script
-# MyModule = nn.Module
-# def __nop(ob): return ob
-# MyFunction = __nop
-# MyStatic = __nop
+# MyModule = torch.jit.ScriptModule
+# MyFunction = torch.jit.script_method
+# MyStatic = torch.jit.script
+MyModule = nn.Module
+def __nop(ob): return ob
+MyFunction = __nop
+MyStatic = __nop
 
 DTYPE = torch.half
 
@@ -47,6 +47,8 @@ def forward(ctx, state, r, w, k, v, a, b):
             y = torch.empty((T, C), device=k.device, dtype=DTYPE, requires_grad=False, memory_format=torch.contiguous_format)
             torch.ops.rwkv7_state_fwd_fp16.forward(1, T, C, H, state, r, w, k, v, a, b, y)
             return y
+
+@torch.compiler.disable
 def RWKV7_OP(state, r, w, k, v, a, b):
     return WKV_7.apply(state, r, w, k, v, a, b)
 
@@ -96,7 +98,10 @@ def forward(self, idx, state, full_output=False):
         else:
             return self.forward_one(idx, state)
 
-    @MyFunction
+    @torch.compile(mode='max-autotune-no-cudagraphs')
+    # @torch.compile(mode='reduce-overhead')
+    # @torch.compile(mode='max-autotune')
+    # @MyFunction
     def forward_one(self, idx:int, state:List[torch.Tensor]):
         with torch.no_grad(): 
             z = self.z
@@ -127,7 +132,8 @@ def forward_one(self, idx:int, state:List[torch.Tensor]):
             x = x @ z['head.weight']
             return x, state
         
-    @MyFunction
+    # @torch.compile(mode='max-autotune-no-cudagraphs')
+    # @MyFunction
     def forward_seq(self, idx:List[int], state:List[torch.Tensor], full_output:bool=False):
         with torch.no_grad(): 
             z = self.z