Compiler dirty tricks on version2 may report wrong results #20

@Dr-Noob

Description

I tried version2 on my 4790K (Haswell), whose theoretical peak is a bit less than 512 GFLOP/s. I got ~540 GFLOP/s on the 'Single-Precision - 256-bit AVX - Multiply' test, more precisely in bench_mul_f32v3_AVX_chains12(). From what I know, I should get about half of peak, because this test does not use FMA. So, to understand better what is happening, I spent some time building an MWE, extracting the function and just the code needed to get it running (I have been compiling it with g++ -std=c++0x -fopenmp -O2 -march=haswell).
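For reference, this is the back-of-the-envelope peak I am assuming for the 4790K (the 4.0 GHz clock is an assumption about the sustained all-core speed; the exact number depends on how the clock behaves under AVX load, which is why I say "a bit less than 512"):

#include <stdio.h>

// Rough single-precision peak for an i7-4790K (Haswell).
// Assumes 4 cores at ~4.0 GHz with both FMA pipes busy every cycle.
int main() {
  const double ghz       = 4.0;  // assumed all-core clock
  const int    cores     = 4;
  const int    sp_lanes  = 8;    // one 256-bit AVX register holds 8 floats
  const int    fma_units = 2;    // two FMA pipes per Haswell core

  double fma_peak = cores * ghz * sp_lanes * fma_units * 2;  // 2 FLOPs per FMA -> ~512 GFLOP/s
  double mul_peak = fma_peak / 2;                            // a multiply counts as 1 FLOP -> ~256 GFLOP/s

  printf("FMA peak      = %.0f GFLOP/s\n", fma_peak);
  printf("Multiply peak = %.0f GFLOP/s\n", mul_peak);
}

With those assumptions, ~540 GFLOP/s is more than double the multiply-only peak, which is why I suspected the benchmark rather than the machine. The MWE is: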

#include <string.h>
#include <stdio.h>
#include <omp.h>
#include <immintrin.h>
#include <sys/time.h>

#define flops_reduce_chains4(   \
    vadd,   \
    r0, r1, r2, r3 \
){  \
    r0 = vadd(r0, r2);   \
    r1 = vadd(r1, r3);   \
    \
    r0 = vadd(r0, r1);   \
}
#define flops_reduce_chains8(   \
    vadd,   \
    r0, r1, r2, r3, r4, r5, r6, r7 \
){  \
    r0 = vadd(r0, r4);   \
    r1 = vadd(r1, r5);   \
    r2 = vadd(r2, r6);   \
    r3 = vadd(r3, r7);   \
    \
    r0 = vadd(r0, r2);   \
    r1 = vadd(r1, r3);   \
    \
    r0 = vadd(r0, r1);   \
}

#define flops_reduce_chains12(  \
    vadd,   \
    r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB \
){  \
    flops_reduce_chains8(vadd, r0, r1, r2, r3, r4, r5, r6, r7); \
    flops_reduce_chains4(vadd, r8, r9, rA, rB); \
    r0 = vadd(r0, r8);   \
}

#define flops_mul_chains12_ops24(   \
    vmul,   \
    mul0, mul1,  \
    r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB \
){  \
    r0 = vmul(r0, mul0); \
    r1 = vmul(r1, mul0); \
    r2 = vmul(r2, mul0); \
    r3 = vmul(r3, mul0); \
    r4 = vmul(r4, mul0); \
    r5 = vmul(r5, mul0); \
    r6 = vmul(r6, mul0); \
    r7 = vmul(r7, mul0); \
    r8 = vmul(r8, mul0); \
    r9 = vmul(r9, mul0); \
    rA = vmul(rA, mul0); \
    rB = vmul(rB, mul0); \
    r0 = vmul(r0, mul1); \
    r1 = vmul(r1, mul1); \
    r2 = vmul(r2, mul1); \
    r3 = vmul(r3, mul1); \
    r4 = vmul(r4, mul1); \
    r5 = vmul(r5, mul1); \
    r6 = vmul(r6, mul1); \
    r7 = vmul(r7, mul1); \
    r8 = vmul(r8, mul1); \
    r9 = vmul(r9, mul1); \
    rA = vmul(rA, mul1); \
    rB = vmul(rB, mul1); \
}

#define flops_mul_chains12_unroll2_ops48(   \
    vmul,   \
    mul0, mul1,  \
    r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB \
){  \
    flops_mul_chains12_ops24(vmul, mul0, mul1, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB);   \
    flops_mul_chains12_ops24(vmul, mul0, mul1, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB);   \
}

const double TEST_MUL_MUL   =   1.4142135623730950488;
const double TEST_MUL_DIV   =   0.70710678118654752440;

inline float reduce(__m256 y){
    __m128 x = _mm_add_ps(_mm256_castps256_ps128(y), _mm256_extractf128_ps(y, 1));
    x = _mm_add_ps(x, _mm_unpackhi_ps(x, x));
    x = _mm_add_ps(x, _mm_shuffle_ps(x, x, 1));
    return _mm_cvtss_f32(x);
}

// Runs the multiply kernel on one thread and returns the nominal FLOP count.
long run_loop(long iterations, double &result) {
    const __m256 mul0 = _mm256_set1_ps((float)TEST_MUL_MUL);
    const __m256 mul1 = _mm256_set1_ps((float)TEST_MUL_DIV);

    __m256 r0 = _mm256_set1_ps(1.0f);
    __m256 r1 = _mm256_set1_ps(1.1f);
    __m256 r2 = _mm256_set1_ps(1.2f);
    __m256 r3 = _mm256_set1_ps(1.3f);
    __m256 r4 = _mm256_set1_ps(1.4f);
    __m256 r5 = _mm256_set1_ps(1.5f);
    __m256 r6 = _mm256_set1_ps(1.6f);
    __m256 r7 = _mm256_set1_ps(1.7f);
    __m256 r8 = _mm256_set1_ps(1.8f);
    __m256 r9 = _mm256_set1_ps(1.9f);
    __m256 rA = _mm256_set1_ps(2.0f);
    __m256 rB = _mm256_set1_ps(2.1f);

    for (long i = 0; i < iterations; i++){
      flops_mul_chains12_unroll2_ops48(
          _mm256_mul_ps,
          mul0, mul1,
          r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB
      );
    }
    flops_reduce_chains12(
        _mm256_add_ps,
        r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB
    );

    result = reduce(r0);

    return iterations * 8 * 48;   // 8 SP lanes per AVX register * 48 nominal multiplies per iteration
}


int main() {
  long iterations = 1000000000;
  printf( "Single-Precision - 256-bit AVX - Multiply:\n" );
  printf( "   Dependency Chains = 12\n" );

  int threads = 8;
  long thread_iterations[threads];
  double thread_result[threads];

  for(int i=0;i<threads;i++) {
    thread_iterations[i] = 0;
    thread_result[i] = 0.0;
  }

  struct timeval t0,t1;
  gettimeofday(&t0, 0);
#pragma omp parallel num_threads((int)threads)
  {
      size_t thread_id = omp_get_thread_num();

      double result;
      thread_iterations[thread_id] = run_loop(iterations, result);
      thread_result[thread_id] = result;
  }
  gettimeofday(&t1, 0);

  double result = 0;
  iterations = 0;
  for (int i = 0; i < threads; i++){
      result += thread_result[i];
      iterations += thread_iterations[i];
  }

  double seconds = (double)((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000000;
  printf( "   Result     = %f\n",result );
  printf( "   FP Ops     = %d\n",iterations );
  printf( "   seconds    = %f\n",seconds );
  printf( "   GFlops     = %f\n",iterations / seconds / 1.e9 );
}

This MWE gives me the same ~540 GFLOP/s.
I checked the assembly and saw that the loop doing all the work is not performing 48 multiplies per iteration as you intended; it is actually performing 20. If I change the 48 to 20 in the operation count at the return, I get a reasonable result (~230 GFLOP/s).
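As a rough consistency check (my own arithmetic, using the assumed ~256 GFLOP/s multiply-only peak from above): 540 GFLOP/s * 20 / 48 ≈ 225 GFLOP/s, which lines up with the ~230 GFLOP/s I get after correcting the count and sits just below that multiply-only peak.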
