I tried version 2 on my 4790K (Haswell), whose theoretical peak is a bit less than 512 GFLOP/s. I got ~540 GFLOP/s in the 'Single-Precision - 256-bit AVX - Multiply' test, more precisely in bench_mul_f32v3_AVX_chains12(). As far as I know, I should get about peak/2 there, because this test does not use FMA. So I spent some time building an MWE, extracting the function and just the code needed to get it running, to understand better what is happening (I have been compiling it with g++ -std=c++0x -fopenmp -O2 -march=haswell).
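For reference, this is the back-of-the-envelope peak I am assuming (4 physical cores at roughly the 4.0 GHz base clock; Haswell has two 256-bit FP multiply/FMA ports, and each FMA counts as two ops):

4 cores × 4.0 GHz × 8 SP lanes × 2 ports × 2 (FMA) = 512 GFLOP/s
4 cores × 4.0 GHz × 8 SP lanes × 2 ports (multiply only) = 256 GFLOP/s

The MWE is: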
```cpp
#include <string.h>
#include <stdio.h>
#include <omp.h>
#include <immintrin.h>
#include <sys/time.h>
// The reduce_chainsN macros tree-sum N independent accumulator chains into r0.
#define flops_reduce_chains4( \
vadd, \
r0, r1, r2, r3 \
){ \
r0 = vadd(r0, r2); \
r1 = vadd(r1, r3); \
\
r0 = vadd(r0, r1); \
}
#define flops_reduce_chains8( \
vadd, \
r0, r1, r2, r3, r4, r5, r6, r7 \
){ \
r0 = vadd(r0, r4); \
r1 = vadd(r1, r5); \
r2 = vadd(r2, r6); \
r3 = vadd(r3, r7); \
\
r0 = vadd(r0, r2); \
r1 = vadd(r1, r3); \
\
r0 = vadd(r0, r1); \
}
#define flops_reduce_chains12( \
vadd, \
r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB \
){ \
flops_reduce_chains8(vadd, r0, r1, r2, r3, r4, r5, r6, r7); \
flops_reduce_chains4(vadd, r8, r9, rA, rB); \
r0 = vadd(r0, r8); \
}
// 24 multiplies: each of the 12 chains is multiplied once by mul0 and once by mul1.
#define flops_mul_chains12_ops24( \
vmul, \
mul0, mul1, \
r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB \
){ \
r0 = vmul(r0, mul0); \
r1 = vmul(r1, mul0); \
r2 = vmul(r2, mul0); \
r3 = vmul(r3, mul0); \
r4 = vmul(r4, mul0); \
r5 = vmul(r5, mul0); \
r6 = vmul(r6, mul0); \
r7 = vmul(r7, mul0); \
r8 = vmul(r8, mul0); \
r9 = vmul(r9, mul0); \
rA = vmul(rA, mul0); \
rB = vmul(rB, mul0); \
r0 = vmul(r0, mul1); \
r1 = vmul(r1, mul1); \
r2 = vmul(r2, mul1); \
r3 = vmul(r3, mul1); \
r4 = vmul(r4, mul1); \
r5 = vmul(r5, mul1); \
r6 = vmul(r6, mul1); \
r7 = vmul(r7, mul1); \
r8 = vmul(r8, mul1); \
r9 = vmul(r9, mul1); \
rA = vmul(rA, mul1); \
rB = vmul(rB, mul1); \
}
// 48 multiplies per call: two back-to-back ops24 blocks on the same 12 chains.
#define flops_mul_chains12_unroll2_ops48( \
vmul, \
mul0, mul1, \
r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB \
){ \
flops_mul_chains12_ops24(vmul, mul0, mul1, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB); \
flops_mul_chains12_ops24(vmul, mul0, mul1, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB); \
}
// mul0 * mul1 ~= 1, so the accumulators stay bounded across iterations.
const double TEST_MUL_MUL = 1.4142135623730950488;  // sqrt(2)
const double TEST_MUL_DIV = 0.70710678118654752440; // 1/sqrt(2)
// Horizontal sum of the 8 floats in a 256-bit register.
inline float reduce(__m256 y){
__m128 x = _mm_add_ps(_mm256_castps256_ps128(y), _mm256_extractf128_ps(y, 1));
x = _mm_add_ps(x, _mm_unpackhi_ps(x, x));
x = _mm_add_ps(x, _mm_shuffle_ps(x, x, 1));
return _mm_cvtss_f32(x);
}
// Runs the multiply kernel and returns the number of FP ops it should have performed.
long run_loop(long iterations, double &result) {
const __m256 mul0 = _mm256_set1_ps((float)TEST_MUL_MUL);
const __m256 mul1 = _mm256_set1_ps((float)TEST_MUL_DIV);
__m256 r0 = _mm256_set1_ps(1.0f);
__m256 r1 = _mm256_set1_ps(1.1f);
__m256 r2 = _mm256_set1_ps(1.2f);
__m256 r3 = _mm256_set1_ps(1.3f);
__m256 r4 = _mm256_set1_ps(1.4f);
__m256 r5 = _mm256_set1_ps(1.5f);
__m256 r6 = _mm256_set1_ps(1.6f);
__m256 r7 = _mm256_set1_ps(1.7f);
__m256 r8 = _mm256_set1_ps(1.8f);
__m256 r9 = _mm256_set1_ps(1.9f);
__m256 rA = _mm256_set1_ps(2.0f);
__m256 rB = _mm256_set1_ps(2.1f);
for (long i = 0; i < iterations; i++){
flops_mul_chains12_unroll2_ops48(
_mm256_mul_ps,
mul0, mul1,
r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB
);
}
flops_reduce_chains12(
_mm256_add_ps,
r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB
);
result = reduce(r0);
// 8 floats per __m256 vector * 48 multiplies per loop iteration.
return iterations * 8 * 48;
}
int main() {
long iterations = 1000000000;
printf( "Single-Precision - 256-bit AVX - Multiply:\n" );
printf( " Dependency Chains = 12\n" );
const int threads = 8; // const so the arrays below are not VLAs
long thread_iterations[threads];
double thread_result[threads];
for(int i=0;i<threads;i++) {
thread_iterations[i] = 0;
thread_result[i] = 0.0;
}
struct timeval t0,t1;
gettimeofday(&t0, 0);
#pragma omp parallel num_threads((int)threads)
{
size_t thread_id = omp_get_thread_num();
double result;
thread_iterations[thread_id] = run_loop(iterations, result);
thread_result[thread_id] = result;
}
gettimeofday(&t1, 0);
double result = 0;
iterations = 0;
for (int i = 0; i < threads; i++){
result += thread_result[i];
iterations += thread_iterations[i];
}
double seconds = (double)((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000000;
printf( " Result = %f\n",result );
printf( " FP Ops = %d\n",iterations );
printf( " seconds = %f\n",seconds );
printf( " GFlops = %f\n",iterations / seconds / 1.e9 );
}
```
This MWE gives me the same ~540 GFLOP/s.
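As a sanity check (again assuming 4 physical cores at roughly the 4.0 GHz base clock), that number is already impossible for real multiplies: 540e9 FLOP/s ÷ 8 lanes ÷ 4 cores ÷ 4.0 GHz ≈ 4.2 vector multiplies per core per cycle, while Haswell can issue at most 2 per cycle. So some of the multiplies must be disappearing.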
I checked assembly and saw that the loop that is doing all the work is not doing 48 ops as you intented, it is actually doing 20. If I change 48 to 20 in the operations done at the return, I get a reasonable result(~230 GFLOP/s)
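Not part of the original run, but a minimal sketch of one countermeasure I would try (assuming GCC; the helper name keep_alive is mine, in the spirit of the empty-asm DoNotOptimize trick): an inline-asm barrier that claims to read and modify each accumulator, so the compiler cannot merge or delete the multiplies feeding it.

```cpp
#include <immintrin.h>

// Empty asm that pretends to read and write the vector register.
// The compiler must materialize v's exact value at this point, so
// the multiply chain that produces it cannot be folded away.
inline void keep_alive(__m256 &v){
    asm volatile("" : "+x"(v));
}

// Usage in the benchmark loop (once per iteration, per accumulator):
//   for (long i = 0; i < iterations; i++){
//       flops_mul_chains12_unroll2_ops48(
//           _mm256_mul_ps, mul0, mul1,
//           r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB);
//       keep_alive(r0); keep_alive(r1); keep_alive(r2); keep_alive(r3);
//       keep_alive(r4); keep_alive(r5); keep_alive(r6); keep_alive(r7);
//       keep_alive(r8); keep_alive(r9); keep_alive(rA); keep_alive(rB);
//   }
```

This adds a small per-iteration cost, but it should force every _mm256_mul_ps in the chain to actually execute.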