From 28a53f4f54c042b21de4beba276034a3607e9409 Mon Sep 17 00:00:00 2001 From: "Sydir, Jerry" Date: Wed, 11 Jun 2025 18:16:40 -0700 Subject: [PATCH 1/4] Modified Default Min size to 64K Added check to exclude operations with failed DSA calls from consideration by the CPU fraction auto-tuning algorithm Added documentation on common usage modes for DTO. Signed-off-by: Sydir, Jerry --- README.md | 22 ++++++++++++++++++++++ dto.c | 9 ++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c9f3fe0..d1ba97a 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,28 @@ Following environment variables control the behavior of DTO library: DTO_LOG_LEVEL=0/1/2 controls the log level. higher value means more verbose logging (default 0). ``` +Although not the only usage models of DTO, the following are some common ones: + Latency reduction - the goal is to minimize the latency of offloaded operations. Use the following settings: + DTO_AUTO_ADJUST_KNOBS=1 (the CPU fraction setting is critical to this mode. The optimal value is dynamic so autotune algorithm nees to be enabled) + DTO_WAIT_METHOD=busypoll + + Power Reduction - the goal is to reduce power by offloading memory operations to DSA allowing the cpu core to go into a lower power state. This mode may reduce or increase the latency of operations depending on the load on DSA devices. + DTO_AUTO_ADJUST_KNOBS=0 + DTO_CPU_SIZE_FRACTION=0.0 (offload the entire operations to DSA) + DTO_WAIT_METHOD=umwait + + Cycle count Reduction - the goal is to reduce cpu cycles by offloading memory operations to DSA. This mode may reduce or increase the latency of operations depending on the load on DSA devices and on interaction with the OS scheduler and other threads. The idea is to offload operations to DSA and allow the OS to schedule other work while DSA performs the operation. 
+ DTO_AUTO_ADJUST_KNOBS=0 + DTO_CPU_SIZE_FRACTION=0.0 (offload the entire operations to DSA) + DTO_WAIT_METHOD=yield + + Avoiding Cache pollution - the goal is to avoid polluting the cache with data from the given process. + DTO_DSA_CC=0 + DTO_AUTO_ADJUST_KNOBS=0 + DTO_CPU_SIZE_FRACTION=0.0 (offload the entire operations to DSA so none of the data is pulled into cache) + DTO_WAIT_METHOD=yield or umwait (saves either cycles or power) + + ## Build Pre-requisite packages: diff --git a/dto.c b/dto.c index b7a3a1c..e5d1db4 100644 --- a/dto.c +++ b/dto.c @@ -44,7 +44,7 @@ */ #define MAX_WQS 32 #define MAX_NUMA_NODES 32 -#define DTO_DEFAULT_MIN_SIZE 16384 +#define DTO_DEFAULT_MIN_SIZE 65536 #define DTO_INITIALIZED 0 #define DTO_INITIALIZING 1 @@ -429,6 +429,13 @@ static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp) __dsa_wait(comp); local_num_waits++; } + + // operations that have failed (mostly due to page fault) return very quickly and cause the algorithm + // to think that the the DSA operation was faster than it really was. We exclude them from the calculation. + if(*comp != DSA_COMP_SUCCESS) { + return; + } + adjust_num_descs++; adjust_num_waits += local_num_waits; From 900ea5ccadfc5c7ad0ade2c150ca70f48184acb1 Mon Sep 17 00:00:00 2001 From: jsydir Date: Thu, 12 Jun 2025 11:48:32 -0700 Subject: [PATCH 2/4] Update dto.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- dto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dto.c b/dto.c index e5d1db4..2ca9266 100644 --- a/dto.c +++ b/dto.c @@ -431,7 +431,7 @@ static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp) } // operations that have failed (mostly due to page fault) return very quickly and cause the algorithm - // to think that the the DSA operation was faster than it really was. We exclude them from the calculation. + // to think that the DSA operation was faster than it really was. We exclude them from the calculation. 
if(*comp != DSA_COMP_SUCCESS) { return; } From c504c137b1fc22efa312b558349d65b035f57313 Mon Sep 17 00:00:00 2001 From: jsydir Date: Thu, 12 Jun 2025 11:48:51 -0700 Subject: [PATCH 3/4] Update README.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d1ba97a..43a652d 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ Following environment variables control the behavior of DTO library: Although not the only usage models of DTO, the following are some common ones: Latency reduction - the goal is to minimize the latency of offloaded operations. Use the following settings: - DTO_AUTO_ADJUST_KNOBS=1 (the CPU fraction setting is critical to this mode. The optimal value is dynamic so autotune algorithm nees to be enabled) + DTO_AUTO_ADJUST_KNOBS=1 (the CPU fraction setting is critical to this mode. The optimal value is dynamic so autotune algorithm needs to be enabled) DTO_WAIT_METHOD=busypoll Power Reduction - the goal is to reduce power by offloading memory operations to DSA allowing the cpu core to go into a lower power state. This mode may reduce or increase the latency of operations depending on the load on DSA devices. From d2d981161738163dbc8b688b14377d216740da7c Mon Sep 17 00:00:00 2001 From: "Sydir, Jerry" Date: Thu, 12 Jun 2025 14:46:35 -0700 Subject: [PATCH 4/4] added space to line 429 Signed-off-by: Sydir, Jerry --- dto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dto.c b/dto.c index 2ca9266..477ebf2 100644 --- a/dto.c +++ b/dto.c @@ -432,7 +432,7 @@ static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp) // operations that have failed (mostly due to page fault) return very quickly and cause the algorithm // to think that the DSA operation was faster than it really was. We exclude them from the calculation. - if(*comp != DSA_COMP_SUCCESS) { + if (*comp != DSA_COMP_SUCCESS) { return; }