open-s4c · theodegeest · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026
diff --git a/benchkit/benchmark.py b/benchkit/benchmark.py
@@ -30,14 +30,7 @@
 )
 from benchkit.utils.system import get_boot_args
 from benchkit.utils.tee import teeprint
-from benchkit.utils.types import (
-    Command,
-    Constants,
-    Environment,
-    PathType,
-    Pretty,
-    SplitCommand,
-)
+from benchkit.utils.types import Command, Constants, Environment, PathType, Pretty, SplitCommand
 from benchkit.utils.variables import list_groupby
 
 RecordKey = str

diff --git a/benchkit/cli/init.py b/benchkit/cli/init.py
@@ -7,11 +7,7 @@
 import black
 import isort
 
-from benchkit.cli.generate import (
-    generate_benchmark,
-    generate_campaign,
-    get_gitignore_content,
-)
+from benchkit.cli.generate import generate_benchmark, generate_campaign, get_gitignore_content
 from benchkit.utils.misc import get_benchkit_temp_folder_str
 
 _DOTGIT_DIR = Path(".git")

diff --git a/benchkit/commandwrappers/javaperf.py b/benchkit/commandwrappers/javaperf.py
@@ -14,11 +14,7 @@
 from typing import Dict, List, Optional
 
 from benchkit.benchmark import RecordResult, WriteRecordFileFunction
-from benchkit.commandwrappers.perf import (
-    PerfRecordWrap,
-    PerfStatWrap,
-    _perf_command_prefix,
-)
+from benchkit.commandwrappers.perf import PerfRecordWrap, PerfStatWrap, _perf_command_prefix
 from benchkit.helpers.linux import ps
 from benchkit.platforms import Platform
 from benchkit.shell.shell import shell_interactive, shell_out

diff --git a/benchkit/commandwrappers/perf.py b/benchkit/commandwrappers/perf.py
@@ -805,6 +805,31 @@ def post_run_hook_report(
         if self._report_interactive:
             shell_interactive(command=command, ignore_ret_codes=(-13,))  # ignore broken pipe error
 
+    def post_run_hook_script(
+        self,
+        experiment_results_lines: List[RecordResult],
+        record_data_dir: PathType,
+        write_record_file_fun: WriteRecordFileFunction,
+    ) -> Optional[Dict[str, Any]]:
+        """Post run hook to generate extension of result dict holding the results of perf script.
+
+        Args:
+            experiment_results_lines (List[RecordResult]): the record results.
+            record_data_dir (PathType): path to the record data directory.
+            write_record_file_fun (WriteRecordFileFunction): callback to record a file into data
+                                                             directory.
+        """
+        assert experiment_results_lines and record_data_dir
+
+        perf_data_pathname = self.latest_perf_path
+        self._chown(pathname=perf_data_pathname)
+
+        command = self._perf_script_command(perf_data_pathname=perf_data_pathname)
+
+        output = shell_out(command, print_output=False)
+
+        write_record_file_fun(file_content=output.strip(), filename="perf.script")
+
     def fetch_flamegraph(
         self,
         flamegraph_commit: str = "41fee1f99f9276008b7cd112fca19dc3ea84ac32",
@@ -1024,6 +1049,16 @@ def _perf_report_command(self, perf_data_pathname: PathType) -> SplitCommand:
 
         return command
 
+    def _perf_script_command(self, perf_data_pathname: PathType) -> SplitCommand:
+        command = [
+            self._perf_bin,
+            "script",
+            "--input",
+            f"{perf_data_pathname}",
+        ] + self.perf_report_options
+
+        return command
+
     def _fzf(
         self,
         search_dir: PathType,

diff --git a/benchkit/commandwrappers/speedupstack.py b/benchkit/commandwrappers/speedupstack.py
@@ -1,16 +1,28 @@
 # Copyright (C) 2025 Vrije Universiteit Brussel. All rights reserved.
 # SPDX-License-Identifier: MIT
 
+import pathlib
+import re
+import time
+from collections import defaultdict
 from signal import SIGCONT, SIGSTOP
-from typing import List
+from threading import Thread
+from typing import Any, Dict, List, Optional
 
+from benchkit.benchmark import RecordResult, WriteRecordFileFunction
 from benchkit.commandattachments.klockstat import Klockstat
 from benchkit.commandattachments.llcstat import Llcstat
 from benchkit.commandattachments.offcputime import Offcputime
 from benchkit.commandattachments.signal import Signal
 from benchkit.commandwrappers import CommandWrapper
+from benchkit.commandwrappers.perf import PerfRecordWrap, _perf_command_prefix
 from benchkit.commandwrappers.strace import StraceWrap
 from benchkit.dependencies.packages import PackageDependency
+from benchkit.helpers.linux import ps
+from benchkit.platforms import get_current_platform
+from benchkit.platforms.generic import Platform
+from benchkit.shell.shell import shell_out
+from benchkit.shell.shellasync import AsyncProcess
 from benchkit.utils.types import PathType
 
 
@@ -23,9 +35,18 @@ def __init__(self, libbpf_tools_dir: PathType) -> None:
         self._llcstat = Llcstat(libbpf_tools_dir)
         self._strace = StraceWrap(pid=True, summary=False, summary_only=True)
 
+        self._perf_record_lock = PerfRecordLockWrap(
+            perf_record_options=["-e", "syscalls:sys_enter_futex,syscalls:sys_exit_futex"],
+            perf_report_options=["--ns"],
+            report_file=True,
+            report_interactive=False,
+        )
+
         self._sigstop = Signal(signal_type=SIGSTOP)
         self._sigcont = Signal(signal_type=SIGCONT)
 
+        self._perf_record_lock_attachment_thread = None
+
     def command_wrappers(self):
         return []
 
@@ -36,6 +57,11 @@ def command_attachments(self):
             self._offcputime.attachment,
             self._llcstat.attachment,
             self._strace.attachment,
+            lambda process, record_data_dir: self._perf_record_lock.attach_every_thread(
+                platform=get_current_platform(),
+                process=process,
+                record_data_dir=record_data_dir,
+            ),
             self._sigcont.attachment,
         ]
 
@@ -45,6 +71,7 @@ def post_run_hooks(self):
             self._offcputime.post_run_hook,
             self._llcstat.post_run_hook,
             self._strace.post_run_hook,
+            self._perf_record_lock.post_run_hook_script,
         ]
 
     def dependencies(self) -> List[PackageDependency]:
@@ -59,3 +86,136 @@ def dependencies(self) -> List[PackageDependency]:
         deps.extend(self._llcstat.dependencies())
 
         return deps
+
+
+class PerfRecordLockWrap(PerfRecordWrap):
+    """
+    This is a command wrapper for the `perf record` utility on multi-threaded benchmarks.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.attachment_thread = None
+        self._data_paths = {}
+
+    def attach_every_thread(self, **kwargs):
+        self.attachment_thread = Thread(target=self.attach_every_thread_worker, kwargs=kwargs)
+        self.attachment_thread.start()
+
+    def attach_every_thread_worker(
+        self,
+        process: AsyncProcess,
+        platform: Platform,
+        record_data_dir: pathlib.Path,
+        poll_ms: int = 10,
+    ):
+        """Command attachment that will attach to every thread of the wrapped process.
+
+        Args:
+            process (AsyncProcess): the process to attach perf-stat to.
+            platform (Platform): the platform where the process is running.
+            record_data_dir (pathlib.Path): the path to the record data directory of the benchmark.
+            poll_ms (int, optional): the period at which to poll the process to detect newly created
+                                     threads. Defaults to 10.
+        """
+        perf_prefix = _perf_command_prefix(perf_bin=self._perf_bin, platform=platform)
+        prefix = ["sudo"] + perf_prefix + ["record"] + self.perf_record_options + ["-t"]
+
+        tids2perf_cmd = {}
+
+        while not process.is_finished():
+            current_tids = ps.get_threads_of_process(pid=process.pid, ignore_any_error_code=True)
+            for tid in current_tids:
+                if tid not in tids2perf_cmd:
+                    perf_data_pathname = record_data_dir / f"perf-record-lock-val-tid-{tid}.txt"
+                    self._data_paths[tid] = perf_data_pathname
+                    tids2perf_cmd[tid] = Thread(
+                        target=self.attach_on_thread_worker,
+                        kwargs={
+                            "prefix": prefix,
+                            "tid": tid,
+                            "perf_data_pathname": perf_data_pathname,
+                        },
+                    )
+                    tids2perf_cmd[tid].start()
+
+            time.sleep(poll_ms / 1000)
+
+        for current_process in tids2perf_cmd.values():
+            try:
+                current_process.join()
+            except AsyncProcess.AsyncProcessError:
+                pass
+
+    def attach_on_thread_worker(
+        self, prefix: list[str], tid: int, perf_data_pathname: pathlib.Path
+    ):
+        cmd = prefix + [f"{tid}", "--output", f"{perf_data_pathname}"]
+        self.platform.comm.shell(command=cmd)
+
+    def post_run_hook_script(
+        self,
+        experiment_results_lines: List[RecordResult],
+        record_data_dir: PathType,
+        write_record_file_fun: WriteRecordFileFunction,
+    ) -> Optional[Dict[str, Any]]:
+        """Post run hook to generate extension of result dict holding the results of perf report.
+
+        Args:
+            experiment_results_lines (List[RecordResult]): the record results.
+            record_data_dir (PathType): path to the record data directory.
+            write_record_file_fun (WriteRecordFileFunction): callback to record a file into data
+                                                             directory.
+        """
+        assert experiment_results_lines and record_data_dir
+        assert self.attachment_thread is not None
+
+        self.attachment_thread.join()
+
+        row_re = re.compile(r"^\s*(\S+)\s+(\d+)\s+\[(\d+)\]\s+(\S+):\s+(\S+):\s+(.*)\s*$")
+
+        wait_start: Dict[int, int] = {}
+        total_wait_time: int = 0
+        total_wait_time_per_thread: Dict[int, int] = defaultdict(lambda: 0)
+
+        for tid_of_file, data_path in self._data_paths.items():
+            self._chown(pathname=data_path)
+            command = self._perf_script_command(perf_data_pathname=data_path)
+            script_file = (
+                record_data_dir / pathlib.Path(f"perf-record-lock-tid-{tid_of_file}.script")
+            ).as_posix()
+
+            output = shell_out(command, print_output=False)
+
+            write_record_file_fun(file_content=output.strip(), filename=script_file)
+
+            for line in output.splitlines():
+                line = line.rstrip()
+                m = row_re.search(line)
+                if m:
+                    # comm = m.group(1)
+                    tid = int(m.group(2))
+                    # cpu = int(m.group(3))
+                    timestamp = int(round(float(m.group(4)) * 1e9))
+                    event = m.group(5)
+                    # data = m.group(6)
+
+                    if event == "syscalls:sys_enter_futex":
+                        wait_start[tid] = timestamp
+
+                    if event == "syscalls:sys_exit_futex":
+                        if tid in wait_start:
+                            wait_time_ns = timestamp - wait_start[tid]
+                            total_wait_time += wait_time_ns
+                            total_wait_time_per_thread[tid] += wait_time_ns
+                            del wait_start[tid]
+
+        self._data_paths = {}
+        return {
+            "perf_record_lock_total_wait_ns": total_wait_time,
+            "perf_record_lock_thread_avg_wait_ns": (
+                (total_wait_time / len(total_wait_time_per_thread))
+                if len(total_wait_time_per_thread) != 0
+                else 0
+            ),
+        }
diff --git a/benchkit/helpers/linux/kernel.py b/benchkit/helpers/linux/kernel.py
@@ -8,12 +8,7 @@
 import pathlib
 from typing import Dict, Iterable, List
 
-from benchkit.helpers.linux.build import (
-    KernelEntry,
-    LinuxBuild,
-    Option,
-    configure_standard_kernel,
-)
+from benchkit.helpers.linux.build import KernelEntry, LinuxBuild, Option, configure_standard_kernel
 from benchkit.shell.shell import shell_out
 from benchkit.utils.types import PathType
 

diff --git a/benchkit/helpers/linux/ps.py b/benchkit/helpers/linux/ps.py
@@ -10,7 +10,7 @@
 from benchkit.shell.shell import shell_out
 
 
-def get_threads_of_process(pid: int) -> List[int]:
+def get_threads_of_process(pid: int, ignore_any_error_code: bool = False) -> List[int]:
     """Get thread identifiers (TIDs) of the given process identifier (PID).
 
     Args:
@@ -26,6 +26,7 @@ def get_threads_of_process(pid: int) -> List[int]:
         f"ps -T -p {pid}",
         print_input=False,
         print_output=False,
+        ignore_any_error_code=ignore_any_error_code,
     )
 
     tids = []

diff --git a/benchkit/hooks/stressNg.py b/benchkit/hooks/stressNg.py
@@ -3,12 +3,7 @@
 
 from typing import List, Optional
 
-from benchkit.benchmark import (
-    PostRunHook,
-    PreRunHook,
-    RecordResult,
-    WriteRecordFileFunction,
-)
+from benchkit.benchmark import PostRunHook, PreRunHook, RecordResult, WriteRecordFileFunction
 from benchkit.platforms import get_current_platform
 from benchkit.shell.shellasync import shell_async
 from benchkit.utils.types import PathType

diff --git a/benchkit/platforms/generic.py b/benchkit/platforms/generic.py
@@ -8,11 +8,7 @@
 
 from benchkit.communication import CommunicationLayer
 from benchkit.platforms import evenorder
-from benchkit.platforms.utils import (
-    get_nb_cpus_active,
-    get_nb_cpus_isolated,
-    get_nb_cpus_total,
-)
+from benchkit.platforms.utils import get_nb_cpus_active, get_nb_cpus_isolated, get_nb_cpus_total
 from benchkit.utils import lscpu
 
 

diff --git a/benchkit/sharedlibs/assignlib.py b/benchkit/sharedlibs/assignlib.py
@@ -8,11 +8,7 @@
 import os
 from typing import Iterable, Optional, Tuple
 
-from benchkit.sharedlibs import (
-    EnvironmentVariables,
-    FromSourceSharedLib,
-    LdPreloadLibraries,
-)
+from benchkit.sharedlibs import EnvironmentVariables, FromSourceSharedLib, LdPreloadLibraries
 from benchkit.shell.shell import shell_out
 from benchkit.utils.dir import (
     caller_file_abs_path,

diff --git a/benchkit/sharedlibs/tiltlib.py b/benchkit/sharedlibs/tiltlib.py
@@ -10,11 +10,7 @@
 from typing import Tuple
 
 from benchkit.platforms import Platform
-from benchkit.sharedlibs import (
-    EnvironmentVariables,
-    FromSourceSharedLib,
-    LdPreloadLibraries,
-)
+from benchkit.sharedlibs import EnvironmentVariables, FromSourceSharedLib, LdPreloadLibraries
 
 
 def cmake_configure_build(

diff --git a/examples/cameracampaign/camera_campaign.py b/examples/cameracampaign/camera_campaign.py
@@ -6,10 +6,7 @@
 from typing import Any, Dict, Iterable, List
 
 from pythainer.examples.builders import get_user_gui_builder
-from pythainer.examples.installs import (
-    opencv_lib_install_from_src,
-    realsense2_lib_install_from_src,
-)
+from pythainer.examples.installs import opencv_lib_install_from_src, realsense2_lib_install_from_src
 from pythainer.examples.runners import camera_runner, gui_runner, personal_runner
 from pythainer.runners import ConcreteDockerRunner