diff --git a/Makefile b/Makefile index 0d6c0dc..95af413 100644 --- a/Makefile +++ b/Makefile @@ -7,10 +7,10 @@ all: libdto dto-test-wodto DML_LIB_CXX=-D_GNU_SOURCE libdto: dto.c - gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -DDTO_STATS_SUPPORT -o libdto.so.1.0 -laccel-config -ldl -lnuma + gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -DDTO_STATS_SUPPORT -o libdto.so.1.0 -laccel-config -ldl -lnuma -mwaitpkg libdto_nostats: dto.c - gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -o libdto.so.1.0 -laccel-config -ldl -lnuma + gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -o libdto.so.1.0 -laccel-config -ldl -lnuma -mwaitpkg install: cp libdto.so.1.0 /usr/lib64/ diff --git a/README.md b/README.md index c9f3fe0..7377fd5 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,13 @@ To improve throughput for synchronous offload, DTO uses "pseudo asynchronous" ex calling thread cpu - cpu-centric numa awareness. 3) In parallel, DTO performs the CPU portion of the job using std library on CPU. 4) DTO waits for DSA to complete (if it hasn't completed already). The wait method can be configured using an environment variable DTO_WAIT_METHOD. + The wait method can be one of the following: yield, busypoll, umwait, or tpause. The default is busypoll. + +For some workloads, complete offloading to the DSA device can result in improved performance and reduce power consumption via the UMWAIT (or TPAUSE) instruction. +In this case, DTO_CPU_SIZE_FRACTION can be set to 0.0, which means that the CPU job is 0 bytes and the entire job is offloaded to DSA and AUTO_ADJUST_KNOBS is set to 0. +Then during the wait step, DTO will use the UMWAIT (or TPAUSE) instruction to wait for the DSA job to complete. UMWAIT / TPAUSE are instructions that allow the CPU to enter a low-power state while waiting for the job to complete. For UMWAIT, please refer to this [enabling guide](https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/monitor-umonitor-performance-guidance.html) for more details. + + DTO also implements a heuristic to auto tune dsa_min_bytes and cpu_size_fraction parameters based on current DSA load. For example, if DSA is heavily loaded, DTO tries to reduce the DSA load by increasing cpu_size_fraction and dsa_min_bytes. Conversely, if DSA is lightly loaded, DTO tries to increase the DSA load by @@ -179,4 +186,4 @@ When linking DTO using LD_PRELOAD environment variable special care is required in the script. - When the application is started by a script with #! which invokes another script with #!, for unknown reasons DTO causes a segmentation fault during a memset operation on an 8K sized buffer. This can be avoided by setting the minimum - DTO size above 8K, or by avoiding this invocation sequence. \ No newline at end of file + DTO size above 8K, or by avoiding this invocation sequence. diff --git a/dto.c b/dto.c index b7a3a1c..48d1be2 100644 --- a/dto.c +++ b/dto.c @@ -30,9 +30,11 @@ // DSA capabilities #define GENCAP_CC_MEMORY 0x4 -#define UMWAIT_DELAY_DEFAULT 100000 -/* C0.1 state */ -#define UMWAIT_STATE 1 +#define UMWAIT_DELAY_DEFAULT 100000 //cycles until umwait timeout + +#define C01_STATE 1 +#define C02_STATE 0 +#define TPAUSE_DELAY 1000 #define USE_ORIG_FUNC(n, use_dsa) (use_std_lib_calls == 1 || !use_dsa || n < dsa_min_size) #define TS_NS(s, e) (((e.tv_sec*1000000000) + e.tv_nsec) - ((s.tv_sec*1000000000) + s.tv_nsec)) @@ -44,10 +46,15 @@ */ #define MAX_WQS 32 #define MAX_NUMA_NODES 32 -#define DTO_DEFAULT_MIN_SIZE 16384 +#define DTO_DEFAULT_MIN_SIZE 65536 #define DTO_INITIALIZED 0 #define DTO_INITIALIZING 1 + +#define NSEC_PER_SEC (1000000000) +#define MSEC_PER_SEC (1000) +#define NSEC_PER_MSEC (NSEC_PER_SEC/MSEC_PER_SEC) + // thread specific variables static __thread struct dsa_hw_desc thr_desc; static __thread struct dsa_completion_record thr_comp __attribute__((aligned(32))); @@ -79,7 +86,8 @@ struct dto_device { enum wait_options { WAIT_BUSYPOLL = 0, WAIT_UMWAIT, - WAIT_YIELD + WAIT_YIELD, + WAIT_TPAUSE }; enum numa_aware { @@ -105,7 +113,7 @@ static atomic_uchar dto_initializing; static uint8_t use_std_lib_calls; static enum numa_aware is_numa_aware; static size_t dsa_min_size = DTO_DEFAULT_MIN_SIZE; -static int wait_method = WAIT_YIELD; +static int wait_method = WAIT_BUSYPOLL; static size_t cpu_size_fraction; // range of values is 0 to 99 static uint8_t dto_dsa_memcpy = 1; @@ -114,6 +122,18 @@ static uint8_t dto_dsa_memset = 1; static uint8_t dto_dsa_memcmp = 1; static uint8_t dto_dsa_cc = 1; +static bool dto_use_c02 = true; //C02 state is default - + //C02 avg exit latency is ~500 ns + //and C01 is about ~240 ns on SPR + +#define TPAUSE_C02_DELAY_NS 6000 //in this case we are offloading so delay can + //be ~6 us as this is around the time a > 64KB + //copy takes to complete + +#define TPAUSE_C01_DELAY_NS 1000 //keep smaller because we want to wake up + //with lower latency + +static uint64_t tpause_wait_time = TPAUSE_C02_DELAY_NS; static unsigned long dto_umwait_delay = UMWAIT_DELAY_DEFAULT; @@ -171,6 +191,7 @@ static const char * const wait_names[] = { [WAIT_BUSYPOLL] = "busypoll", [WAIT_UMWAIT] = "umwait", [WAIT_YIELD] = "yield", + [WAIT_TPAUSE] = "tpause" }; static int collect_stats; @@ -223,7 +244,7 @@ static atomic_int fail_counter[HIST_NO_BUCKETS][MAX_FAILURES]; static int init_dto(void) __attribute__((constructor)); static void cleanup_dto(void) __attribute__((destructor)); -static int umwait_support; +static int waitpkg_support; static enum { LOG_LEVEL_FATAL, @@ -323,76 +344,78 @@ static __always_inline void movdir64b(struct dsa_hw_desc *desc, volatile void *r : : "a" (reg), "d" (desc)); } -static __always_inline void umonitor(const volatile void *addr) -{ - asm volatile(".byte 0xf3, 0x48, 0x0f, 0xae, 0xf0" : : "a"(addr)); -} - -static __always_inline int umwait(unsigned long timeout, unsigned int state) -{ - uint8_t r; - uint32_t timeout_low = (uint32_t)timeout; - uint32_t timeout_high = (uint32_t)(timeout >> 32); - - asm volatile(".byte 0xf2, 0x48, 0x0f, 0xae, 0xf1\t\n" - "setc %0\t\n" - : "=r"(r) - : "c"(state), "a"(timeout_low), "d"(timeout_high)); - return r; -} - static __always_inline void dsa_wait_yield(const volatile uint8_t *comp) { while (*comp == 0) { - sched_yield(); + sched_yield(); } } static __always_inline void dsa_wait_busy_poll(const volatile uint8_t *comp) { while (*comp == 0) { - _mm_pause(); + _mm_pause(); } } -static __always_inline void __dsa_wait_umwait(const volatile uint8_t *comp) +static __always_inline void dsa_wait_tpause(const volatile uint8_t *comp) { - umonitor(comp); + while (*comp == 0) { + uint64_t delay = _rdtsc() + tpause_wait_time; + _tpause(C02_STATE, delay); + } +} - // Hardware never writes 0 to this field. Software should initialize this field to 0 - // so it can detect when the completion record has been written - if (*comp == 0) { - uint64_t delay = __rdtsc() + dto_umwait_delay; +static __always_inline void __dsa_wait_umwait(const volatile uint8_t *comp) +{ + _umonitor((void*)comp); - umwait(delay, UMWAIT_STATE); - } + uint64_t delay = _rdtsc() + UMWAIT_DELAY_DEFAULT; + _umwait(C02_STATE, delay); } static __always_inline void dsa_wait_umwait(const volatile uint8_t *comp) { - - while (*comp == 0) - __dsa_wait_umwait(comp); + while (*comp == 0) { + __dsa_wait_umwait(comp); + } } static __always_inline void __dsa_wait(const volatile uint8_t *comp) { - if (wait_method == WAIT_YIELD) + switch(wait_method) { + case WAIT_YIELD: sched_yield(); - else if (wait_method == WAIT_UMWAIT) - __dsa_wait_umwait(comp); - else - _mm_pause(); + break; + case WAIT_UMWAIT: + __dsa_wait_umwait(comp); + break; + case WAIT_TPAUSE: + _tpause( C01_STATE, _rdtsc() + TPAUSE_C01_DELAY_NS); + break; + default: + _mm_pause(); + } } static __always_inline void dsa_wait_no_adjust(const volatile uint8_t *comp) { - if (wait_method == WAIT_YIELD) - dsa_wait_yield(comp); - else if (wait_method == WAIT_UMWAIT) - dsa_wait_umwait(comp); - else - dsa_wait_busy_poll(comp); + switch (wait_method) { + case WAIT_YIELD: + dsa_wait_yield(comp); + break; + case WAIT_UMWAIT: + dsa_wait_umwait(comp); + break; + case WAIT_BUSYPOLL: + dsa_wait_busy_poll(comp); + break; + case WAIT_TPAUSE: + dsa_wait_tpause(comp); + break; + default: + dsa_wait_busy_poll(comp); + } } /* A simple auto-tuning heuristic. @@ -418,8 +441,9 @@ static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp) uint64_t local_num_waits = 0; if ((++num_descs & DESCS_PER_RUN) != DESCS_PER_RUN) { - while (*comp == 0) + while (*comp == 0) { __dsa_wait(comp); + } return; } @@ -1097,13 +1121,13 @@ static int dsa_init(void) const char *env_str; char wq_list[256]; - /* detect umwait support */ + /* detect waitpkg support */ leaf = 7; waitpkg = 0; if (__get_cpuid(0, &leaf, unused, &waitpkg, unused + 1)) { if (waitpkg & 0x20) { - LOG_TRACE("umwait supported\n"); - umwait_support = 1; + LOG_TRACE("waitpkg supported\n"); + waitpkg_support = 1; } } @@ -1114,14 +1138,21 @@ static int dsa_init(void) min_avg_waits = MIN_AVG_POLL_WAITS; max_avg_waits = MAX_AVG_POLL_WAITS; } else if (!strncmp(env_str, wait_names[WAIT_UMWAIT], strlen(wait_names[WAIT_UMWAIT]))) { - if (umwait_support) { + if (waitpkg_support) { wait_method = WAIT_UMWAIT; /* Use the same waits as busypoll for now */ min_avg_waits = MIN_AVG_POLL_WAITS; max_avg_waits = MAX_AVG_POLL_WAITS; } else LOG_ERROR("umwait not supported. Falling back to default wait method\n"); - } + } else if (!strncmp(env_str, wait_names[WAIT_TPAUSE], strlen(wait_names[WAIT_TPAUSE]))) { + if (waitpkg_support) { + wait_method = WAIT_TPAUSE; + } else { + LOG_ERROR("tpause not supported. Falling back to busypoll\n"); + wait_method = WAIT_BUSYPOLL; + } + } } env_str = getenv("DTO_WQ_LIST"); @@ -1341,6 +1372,27 @@ static int init_dto(void) use_std_lib_calls = 1; } + // calculate the wait time for TPAUSE + if (wait_method == WAIT_TPAUSE) { + unsigned int num, den, freq; + unsigned int empty; + unsigned long long tmp; + __get_cpuid( 0x15, &den, &num, &freq, &empty ); + freq /= 1000; + LOG_TRACE( "Core Freq = %u kHz\n", freq ); + LOG_TRACE( "TSC Mult = %u\n", num ); + LOG_TRACE( "TSC Den = %u\n", den ); + freq *= num; + freq /= den; + LOG_TRACE( "CPU freq = %u kHz\n", freq ); + + LOG_TRACE( "Requested wait: %llu nsec\n", tpause_wait_time ); + tmp = tpause_wait_time; + tmp *= freq; + tpause_wait_time = tmp / NSEC_PER_MSEC; + LOG_TRACE( "Requested wait duration: %llu cycles\n", tpause_wait_time ); + } + // display configuration LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, " "cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, numa_awareness: %s, dto_dsa_cc: %d\n",