diff --git a/benchkit/benchmark.py b/benchkit/benchmark.py
index a34e1e70..372cb556 100644
--- a/benchkit/benchmark.py
+++ b/benchkit/benchmark.py
@@ -30,14 +30,7 @@
 )
 from benchkit.utils.system import get_boot_args
 from benchkit.utils.tee import teeprint
-from benchkit.utils.types import (
-    Command,
-    Constants,
-    Environment,
-    PathType,
-    Pretty,
-    SplitCommand,
-)
+from benchkit.utils.types import Command, Constants, Environment, PathType, Pretty, SplitCommand
 from benchkit.utils.variables import list_groupby
 
 RecordKey = str
diff --git a/benchkit/cli/init.py b/benchkit/cli/init.py
index 1b3d22d2..9c71cf63 100644
--- a/benchkit/cli/init.py
+++ b/benchkit/cli/init.py
@@ -7,11 +7,7 @@
 import black
 import isort
 
-from benchkit.cli.generate import (
-    generate_benchmark,
-    generate_campaign,
-    get_gitignore_content,
-)
+from benchkit.cli.generate import generate_benchmark, generate_campaign, get_gitignore_content
 from benchkit.utils.misc import get_benchkit_temp_folder_str
 
 _DOTGIT_DIR = Path(".git")
diff --git a/benchkit/commandwrappers/javaperf.py b/benchkit/commandwrappers/javaperf.py
index 33cd4b40..48905385 100644
--- a/benchkit/commandwrappers/javaperf.py
+++ b/benchkit/commandwrappers/javaperf.py
@@ -14,11 +14,7 @@
 from typing import Dict, List, Optional
 
 from benchkit.benchmark import RecordResult, WriteRecordFileFunction
-from benchkit.commandwrappers.perf import (
-    PerfRecordWrap,
-    PerfStatWrap,
-    _perf_command_prefix,
-)
+from benchkit.commandwrappers.perf import PerfRecordWrap, PerfStatWrap, _perf_command_prefix
 from benchkit.helpers.linux import ps
 from benchkit.platforms import Platform
 from benchkit.shell.shell import shell_interactive, shell_out
diff --git a/benchkit/commandwrappers/perf.py b/benchkit/commandwrappers/perf.py
index 4d4f979c..0132da66 100644
--- a/benchkit/commandwrappers/perf.py
+++ b/benchkit/commandwrappers/perf.py
@@ -805,6 +805,31 @@ def post_run_hook_report(
         if self._report_interactive:
             shell_interactive(command=command, ignore_ret_codes=(-13,))  # ignore broken pipe error
 
+    def post_run_hook_script(
+        self,
+        experiment_results_lines: List[RecordResult],
+        record_data_dir: PathType,
+        write_record_file_fun: WriteRecordFileFunction,
+    ) -> Optional[Dict[str, Any]]:
+        """Post run hook to generate extension of result dict holding the results of perf script.
+
+        Args:
+            experiment_results_lines (List[RecordResult]): the record results.
+            record_data_dir (PathType): path to the record data directory.
+            write_record_file_fun (WriteRecordFileFunction): callback to record a file into data
+                                                             directory.
+        """
+        assert experiment_results_lines and record_data_dir
+
+        perf_data_pathname = self.latest_perf_path
+        self._chown(pathname=perf_data_pathname)
+
+        command = self._perf_script_command(perf_data_pathname=perf_data_pathname)
+
+        output = shell_out(command, print_output=False)
+
+        write_record_file_fun(file_content=output.strip(), filename="perf.script")
+
     def fetch_flamegraph(
         self,
         flamegraph_commit: str = "41fee1f99f9276008b7cd112fca19dc3ea84ac32",
@@ -1024,6 +1049,16 @@ def _perf_report_command(self, perf_data_pathname: PathType) -> SplitCommand:
 
         return command
 
+    def _perf_script_command(self, perf_data_pathname: PathType) -> SplitCommand:
+        command = [
+            self._perf_bin,
+            "script",
+            "--input",
+            f"{perf_data_pathname}",
+        ] + self.perf_report_options
+
+        return command
+
     def _fzf(
         self,
         search_dir: PathType,
diff --git a/benchkit/commandwrappers/speedupstack.py b/benchkit/commandwrappers/speedupstack.py
index 72ce6cdd..8b218b82 100644
--- a/benchkit/commandwrappers/speedupstack.py
+++ b/benchkit/commandwrappers/speedupstack.py
@@ -1,16 +1,28 @@
 # Copyright (C) 2025 Vrije Universiteit Brussel. All rights reserved.
 # SPDX-License-Identifier: MIT
 
+import pathlib
+import re
+import time
+from collections import defaultdict
 from signal import SIGCONT, SIGSTOP
-from typing import List
+from threading import Thread
+from typing import Any, Dict, List, Optional
 
+from benchkit.benchmark import RecordResult, WriteRecordFileFunction
 from benchkit.commandattachments.klockstat import Klockstat
 from benchkit.commandattachments.llcstat import Llcstat
 from benchkit.commandattachments.offcputime import Offcputime
 from benchkit.commandattachments.signal import Signal
 from benchkit.commandwrappers import CommandWrapper
+from benchkit.commandwrappers.perf import PerfRecordWrap, _perf_command_prefix
 from benchkit.commandwrappers.strace import StraceWrap
 from benchkit.dependencies.packages import PackageDependency
+from benchkit.helpers.linux import ps
+from benchkit.platforms import get_current_platform
+from benchkit.platforms.generic import Platform
+from benchkit.shell.shell import shell_out
+from benchkit.shell.shellasync import AsyncProcess
 from benchkit.utils.types import PathType
 
 
@@ -23,9 +35,18 @@ def __init__(self, libbpf_tools_dir: PathType) -> None:
         self._llcstat = Llcstat(libbpf_tools_dir)
         self._strace = StraceWrap(pid=True, summary=False, summary_only=True)
 
+        self._perf_record_lock = PerfRecordLockWrap(
+            perf_record_options=["-e", "syscalls:sys_enter_futex,syscalls:sys_exit_futex"],
+            perf_report_options=["--ns"],
+            report_file=True,
+            report_interactive=False,
+        )
+
         self._sigstop = Signal(signal_type=SIGSTOP)
         self._sigcont = Signal(signal_type=SIGCONT)
 
+        self._perf_record_lock_attachment_thread = None
+
     def command_wrappers(self):
         return []
 
@@ -36,6 +57,11 @@ def command_attachments(self):
             self._offcputime.attachment,
             self._llcstat.attachment,
             self._strace.attachment,
+            lambda process, record_data_dir: self._perf_record_lock.attach_every_thread(
+                platform=get_current_platform(),
+                process=process,
+                record_data_dir=record_data_dir,
+            ),
             self._sigcont.attachment,
         ]
 
@@ -45,6 +71,7 @@ def post_run_hooks(self):
             self._offcputime.post_run_hook,
             self._llcstat.post_run_hook,
             self._strace.post_run_hook,
+            self._perf_record_lock.post_run_hook_script,
         ]
 
     def dependencies(self) -> List[PackageDependency]:
@@ -59,3 +86,136 @@ def dependencies(self) -> List[PackageDependency]:
         deps.extend(self._llcstat.dependencies())
 
         return deps
+
+
+class PerfRecordLockWrap(PerfRecordWrap):
+    """
+    This is a command wrapper for the `perf record` utility on multi-threaded benchmarks.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.attachment_thread = None
+        self._data_paths = {}
+
+    def attach_every_thread(self, **kwargs):
+        self.attachment_thread = Thread(target=self.attach_every_thread_worker, kwargs=kwargs)
+        self.attachment_thread.start()
+
+    def attach_every_thread_worker(
+        self,
+        process: AsyncProcess,
+        platform: Platform,
+        record_data_dir: pathlib.Path,
+        poll_ms: int = 10,
+    ):
+        """Command attachment that will attach to every thread of the wrapped process.
+
+        Args:
+            process (AsyncProcess): the process to attach perf-stat to.
+            platform (Platform): the platform where the process is running.
+            record_data_dir (pathlib.Path): the path to the record data directory of the benchmark.
+            poll_ms (int, optional): the period at which to poll the process to detect newly created
+                                     threads. Defaults to 10.
+        """
+        perf_prefix = _perf_command_prefix(perf_bin=self._perf_bin, platform=platform)
+        prefix = ["sudo"] + perf_prefix + ["record"] + self.perf_record_options + ["-t"]
+
+        tids2perf_cmd = {}
+
+        while not process.is_finished():
+            current_tids = ps.get_threads_of_process(pid=process.pid, ignore_any_error_code=True)
+            for tid in current_tids:
+                if tid not in tids2perf_cmd:
+                    perf_data_pathname = record_data_dir / f"perf-record-lock-val-tid-{tid}.txt"
+                    self._data_paths[tid] = perf_data_pathname
+                    tids2perf_cmd[tid] = Thread(
+                        target=self.attach_on_thread_worker,
+                        kwargs={
+                            "prefix": prefix,
+                            "tid": tid,
+                            "perf_data_pathname": perf_data_pathname,
+                        },
+                    )
+                    tids2perf_cmd[tid].start()
+
+            time.sleep(poll_ms / 1000)
+
+        for current_process in tids2perf_cmd.values():
+            try:
+                current_process.join()
+            except AsyncProcess.AsyncProcessError:
+                pass
+
+    def attach_on_thread_worker(
+        self, prefix: list[str], tid: int, perf_data_pathname: pathlib.Path
+    ):
+        cmd = prefix + [f"{tid}", "--output", f"{perf_data_pathname}"]
+        self.platform.comm.shell(command=cmd)
+
+    def post_run_hook_script(
+        self,
+        experiment_results_lines: List[RecordResult],
+        record_data_dir: PathType,
+        write_record_file_fun: WriteRecordFileFunction,
+    ) -> Optional[Dict[str, Any]]:
+        """Post run hook to generate extension of result dict holding the results of perf report.
+
+        Args:
+            experiment_results_lines (List[RecordResult]): the record results.
+            record_data_dir (PathType): path to the record data directory.
+            write_record_file_fun (WriteRecordFileFunction): callback to record a file into data
+                                                             directory.
+        """
+        assert experiment_results_lines and record_data_dir
+        assert self.attachment_thread is not None
+
+        self.attachment_thread.join()
+
+        row_re = re.compile(r"^\s*(\S+)\s+(\d+)\s+\[(\d+)\]\s+(\S+):\s+(\S+):\s+(.*)\s*$")
+
+        wait_start: Dict[int, int] = {}
+        total_wait_time: int = 0
+        total_wait_time_per_thread: Dict[int, int] = defaultdict(lambda: 0)
+
+        for tid_of_file, data_path in self._data_paths.items():
+            self._chown(pathname=data_path)
+            command = self._perf_script_command(perf_data_pathname=data_path)
+            script_file = (
+                record_data_dir / pathlib.Path(f"perf-record-lock-tid-{tid_of_file}.script")
+            ).as_posix()
+
+            output = shell_out(command, print_output=False)
+
+            write_record_file_fun(file_content=output.strip(), filename=script_file)
+
+            for line in output.splitlines():
+                line = line.rstrip()
+                m = row_re.search(line)
+                if m:
+                    # comm = m.group(1)
+                    tid = int(m.group(2))
+                    # cpu = int(m.group(3))
+                    timestamp = int(round(float(m.group(4)) * 1e9))
+                    event = m.group(5)
+                    # data = m.group(6)
+
+                    if event == "syscalls:sys_enter_futex":
+                        wait_start[tid] = timestamp
+
+                    if event == "syscalls:sys_exit_futex":
+                        if tid in wait_start:
+                            wait_time_ns = timestamp - wait_start[tid]
+                            total_wait_time += wait_time_ns
+                            total_wait_time_per_thread[tid] += wait_time_ns
+                            del wait_start[tid]
+
+        self._data_paths = {}
+        return {
+            "perf_record_lock_total_wait_ns": total_wait_time,
+            "perf_record_lock_thread_avg_wait_ns": (
+                (total_wait_time / len(total_wait_time_per_thread))
+                if len(total_wait_time_per_thread) != 0
+                else 0
+            ),
+        }
diff --git a/benchkit/helpers/linux/kernel.py b/benchkit/helpers/linux/kernel.py
index 7070bd53..a82d2f34 100644
--- a/benchkit/helpers/linux/kernel.py
+++ b/benchkit/helpers/linux/kernel.py
@@ -8,12 +8,7 @@
 import pathlib
 from typing import Dict, Iterable, List
 
-from benchkit.helpers.linux.build import (
-    KernelEntry,
-    LinuxBuild,
-    Option,
-    configure_standard_kernel,
-)
+from benchkit.helpers.linux.build import KernelEntry, LinuxBuild, Option, configure_standard_kernel
 from benchkit.shell.shell import shell_out
 from benchkit.utils.types import PathType
 
diff --git a/benchkit/helpers/linux/ps.py b/benchkit/helpers/linux/ps.py
index 37a20179..f9901b11 100644
--- a/benchkit/helpers/linux/ps.py
+++ b/benchkit/helpers/linux/ps.py
@@ -10,7 +10,7 @@
 from benchkit.shell.shell import shell_out
 
 
-def get_threads_of_process(pid: int) -> List[int]:
+def get_threads_of_process(pid: int, ignore_any_error_code: bool = False) -> List[int]:
     """Get thread identifiers (TIDs) of the given process identifier (PID).
 
     Args:
@@ -26,6 +26,7 @@ def get_threads_of_process(pid: int) -> List[int]:
         f"ps -T -p {pid}",
         print_input=False,
         print_output=False,
+        ignore_any_error_code=ignore_any_error_code,
     )
 
     tids = []
diff --git a/benchkit/hooks/stressNg.py b/benchkit/hooks/stressNg.py
index ac74c662..5780fc6f 100644
--- a/benchkit/hooks/stressNg.py
+++ b/benchkit/hooks/stressNg.py
@@ -3,12 +3,7 @@
 
 from typing import List, Optional
 
-from benchkit.benchmark import (
-    PostRunHook,
-    PreRunHook,
-    RecordResult,
-    WriteRecordFileFunction,
-)
+from benchkit.benchmark import PostRunHook, PreRunHook, RecordResult, WriteRecordFileFunction
 from benchkit.platforms import get_current_platform
 from benchkit.shell.shellasync import shell_async
 from benchkit.utils.types import PathType
diff --git a/benchkit/platforms/generic.py b/benchkit/platforms/generic.py
index 2a1d68bb..4c233b00 100644
--- a/benchkit/platforms/generic.py
+++ b/benchkit/platforms/generic.py
@@ -8,11 +8,7 @@
 
 from benchkit.communication import CommunicationLayer
 from benchkit.platforms import evenorder
-from benchkit.platforms.utils import (
-    get_nb_cpus_active,
-    get_nb_cpus_isolated,
-    get_nb_cpus_total,
-)
+from benchkit.platforms.utils import get_nb_cpus_active, get_nb_cpus_isolated, get_nb_cpus_total
 from benchkit.utils import lscpu
 
 
diff --git a/benchkit/sharedlibs/assignlib.py b/benchkit/sharedlibs/assignlib.py
index de11dbd6..65ff9890 100644
--- a/benchkit/sharedlibs/assignlib.py
+++ b/benchkit/sharedlibs/assignlib.py
@@ -8,11 +8,7 @@
 import os
 from typing import Iterable, Optional, Tuple
 
-from benchkit.sharedlibs import (
-    EnvironmentVariables,
-    FromSourceSharedLib,
-    LdPreloadLibraries,
-)
+from benchkit.sharedlibs import EnvironmentVariables, FromSourceSharedLib, LdPreloadLibraries
 from benchkit.shell.shell import shell_out
 from benchkit.utils.dir import (
     caller_file_abs_path,
diff --git a/benchkit/sharedlibs/tiltlib.py b/benchkit/sharedlibs/tiltlib.py
index 6b23ce63..944d522e 100644
--- a/benchkit/sharedlibs/tiltlib.py
+++ b/benchkit/sharedlibs/tiltlib.py
@@ -10,11 +10,7 @@
 from typing import Tuple
 
 from benchkit.platforms import Platform
-from benchkit.sharedlibs import (
-    EnvironmentVariables,
-    FromSourceSharedLib,
-    LdPreloadLibraries,
-)
+from benchkit.sharedlibs import EnvironmentVariables, FromSourceSharedLib, LdPreloadLibraries
 
 
 def cmake_configure_build(
diff --git a/examples/cameracampaign/camera_campaign.py b/examples/cameracampaign/camera_campaign.py
index 99653261..f69c20de 100644
--- a/examples/cameracampaign/camera_campaign.py
+++ b/examples/cameracampaign/camera_campaign.py
@@ -6,10 +6,7 @@
 from typing import Any, Dict, Iterable, List
 
 from pythainer.examples.builders import get_user_gui_builder
-from pythainer.examples.installs import (
-    opencv_lib_install_from_src,
-    realsense2_lib_install_from_src,
-)
+from pythainer.examples.installs import opencv_lib_install_from_src, realsense2_lib_install_from_src
 from pythainer.examples.runners import camera_runner, gui_runner, personal_runner
 from pythainer.runners import ConcreteDockerRunner
 
diff --git a/examples/ipc/ipc.py b/examples/ipc/ipc.py
index f102abbb..50a65604 100644
--- a/examples/ipc/ipc.py
+++ b/examples/ipc/ipc.py
@@ -161,10 +161,7 @@ def main() -> None:
 
     match target:
         case Target.HARMONY:
-            from benchkit.devices.hdc import (
-                OpenHarmonyCommLayer,
-                OpenHarmonyDeviceConnector,
-            )
+            from benchkit.devices.hdc import OpenHarmonyCommLayer, OpenHarmonyDeviceConnector
 
             bench_dir = "/data/local/tmp"
             device = list(OpenHarmonyDeviceConnector.query_devices())[0]
diff --git a/examples/rocksdb/README.md b/examples/rocksdb/README.md
index c6c0240c..38202ed8 100644
--- a/examples/rocksdb/README.md
+++ b/examples/rocksdb/README.md
@@ -55,7 +55,10 @@ cd ../../..
 
 Running the speedup stack campaign.
 ```
+sudo -v
+while true; do sudo -v; sleep 60; done &
 ./campaign_rocksdb_speedup_stacks.py
+kill %1
 ```
 
 
@@ -88,9 +91,9 @@ sudo setcap cap_sys_resource,cap_sys_admin+eip ./klockstat
 sudo setcap cap_sys_resource,cap_sys_admin+eip ./offcputime
 sudo setcap cap_sys_resource,cap_sys_admin+eip ./llcstat
 sudo setcap cap_sys_ptrace+ep /usr/bin/strace
-kill %1
 cd ../../..
 ./campaign_rocksdb_speedup_stacks.py
+kill %1
 ```
 
 
diff --git a/examples/rocksdb/campaign_rocksdb_speedup_stacks.py b/examples/rocksdb/campaign_rocksdb_speedup_stacks.py
index ee7dddcc..a6c8508c 100755
--- a/examples/rocksdb/campaign_rocksdb_speedup_stacks.py
+++ b/examples/rocksdb/campaign_rocksdb_speedup_stacks.py
@@ -8,12 +8,16 @@
 from rocksdb import rocksdb_campaign
 
 from benchkit.campaign import CampaignSuite
+from benchkit.commandwrappers.perf import enable_non_sudo_perf
 from benchkit.commandwrappers.speedupstack import SpeedupStackWrapper
+from benchkit.platforms import get_current_platform
 from benchkit.utils.dir import get_curdir
 
 
 def main() -> None:
     """Main function of the campaign script."""
+    platform = get_current_platform()
+    enable_non_sudo_perf(comm_layer=platform.comm)
 
     rocksdb_src_dir = (get_curdir(__file__) / "deps/rocksdb/").resolve()
     libbpf_tools_dir = (get_curdir(__file__) / "deps/bcc/libbpf-tools/").resolve()
@@ -22,10 +26,16 @@ def main() -> None:
 
     campaign = rocksdb_campaign(
         src_dir=rocksdb_src_dir,
-        bench_name=["readrandom"],
+        bench_name=[
+            "readrandom",
+            # "readmissing",
+            # "seekrandom",
+            # "multireadrandom",
+            # "readwhilewriting",
+        ],
         nb_runs=5,
-        benchmark_duration_seconds=3,
-        nb_threads=[2, 4, 8],
+        benchmark_duration_seconds=10,
+        nb_threads=[1, 2, 4, 8],
         command_wrappers=([speedupstackwrapper] + speedupstackwrapper.command_wrappers()),
         command_attachments=speedupstackwrapper.command_attachments(),
         post_run_hooks=speedupstackwrapper.post_run_hooks(),
@@ -78,6 +88,22 @@ def main() -> None:
         hue="bench_name",
     )
 
+    suite.generate_graph(
+        title="Perf Record Lock",
+        plot_name="lineplot",
+        x="nb_threads",
+        y="perf_record_lock_total_wait_ns",
+        hue="bench_name",
+    )
+
+    suite.generate_graph(
+        title="Perf Record Lock Thread Avg",
+        plot_name="lineplot",
+        x="nb_threads",
+        y="perf_record_lock_thread_avg_wait_ns",
+        hue="bench_name",
+    )
+
 
 if __name__ == "__main__":
     main()
diff --git a/examples/rocksdb/kit/rocksdb.py b/examples/rocksdb/kit/rocksdb.py
index fa531932..e90304f5 100644
--- a/examples/rocksdb/kit/rocksdb.py
+++ b/examples/rocksdb/kit/rocksdb.py
@@ -11,13 +11,7 @@
 import re
 from typing import Any, Dict, Iterable, List
 
-from benchkit.benchmark import (
-    Benchmark,
-    CommandAttachment,
-    CommandWrapper,
-    PostRunHook,
-    PreRunHook,
-)
+from benchkit.benchmark import Benchmark, CommandAttachment, CommandWrapper, PostRunHook, PreRunHook
 from benchkit.campaign import CampaignCartesianProduct
 from benchkit.dependencies.packages import PackageDependency
 from benchkit.platforms import Platform
diff --git a/tests/test_commlayer.py b/tests/test_commlayer.py
index c39269c9..b7f50bde 100644
--- a/tests/test_commlayer.py
+++ b/tests/test_commlayer.py
@@ -23,10 +23,7 @@ def main() -> None:
             platform = get_current_platform()
             print(platform)
         case Target.HARMONY:
-            from benchkit.devices.hdc import (
-                OpenHarmonyCommLayer,
-                OpenHarmonyDeviceConnector,
-            )
+            from benchkit.devices.hdc import OpenHarmonyCommLayer, OpenHarmonyDeviceConnector
 
             device = list(OpenHarmonyDeviceConnector.query_devices())[0]
             hdc = OpenHarmonyDeviceConnector.from_device(device)
diff --git a/tests/test_logging.py b/tests/test_logging.py
index 09734d95..54e27df9 100644
--- a/tests/test_logging.py
+++ b/tests/test_logging.py
@@ -13,11 +13,7 @@
 import logging
 from pathlib import Path
 
-from benchkit.utils.logging import (
-    bkpprint,
-    bkprint,
-    configure_logging,
-)
+from benchkit.utils.logging import bkpprint, bkprint, configure_logging
 
 
 def main() -> None: