diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/mixin.py b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/mixin.py index 0b3b334..c263c9e 100644 --- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/mixin.py +++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/mixin.py @@ -114,7 +114,8 @@ def template_args(self): dry_runs=self.dry_runs, timers=self.timers, strides=self.strides, - index_type=self.index_type) + index_type=self.index_type, + implementation_name=self.template_file().partition('.')[0]) @contextlib.contextmanager def on_device(self, data): diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/base.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/base.j2 index 1fe1d01..60d67b7 100644 --- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/base.j2 +++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/base.j2 @@ -55,7 +55,7 @@ using hipDeviceProp = hipDeviceProp_t; } {% block gpu_kernel %} -__global__ void gpu_kernel( +__global__ void gpu_kernel_{{ implementation_name }}( {%- for arg in args %} {{ ctype }} * __restrict__ {{ arg }}{{ "," if not loop.last }} {%- endfor %} @@ -135,7 +135,7 @@ extern "C" int kernel( } for (int dry_run = 0; dry_run < {{ dry_runs }}; ++dry_run) { - gpu_kernel<<>>( + gpu_kernel_{{ implementation_name }}<<>>( {%- for arg in args %} {{ arg }}{{ "," if not loop.last }} {%- endfor %} @@ -158,9 +158,9 @@ extern "C" int kernel( {%- endif %} {%- if timers == 'hip-ext' %} - hipExtLaunchKernelGGL(gpu_kernel, grid_size, block_size, smem_size, 0, start, stop, 0, + hipExtLaunchKernelGGL(gpu_kernel_{{ implementation_name }}, grid_size, block_size, smem_size, 0, start, stop, 0, {%- else %} - gpu_kernel<<>>( + gpu_kernel_{{ implementation_name }}<<>>( {%- endif %} {%- for arg in args %} {{ arg }}{{ "," if not loop.last }} diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_classic.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_classic.j2 index e297fb1..0c7e731 100644 --- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_classic.j2 +++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_classic.j2 @@ -186,7 +186,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift, } -__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel( +__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}( const {{ ctype }} *__restrict__ ustage, const {{ ctype }} *__restrict__ upos, const {{ ctype }} *__restrict__ utens, diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmem.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmem.j2 index 708828c..233d86d 100644 --- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmem.j2 +++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmem.j2 @@ -186,7 +186,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift, } -__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel( +__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}( const {{ ctype }} *__restrict__ ustage, const {{ ctype }} *__restrict__ upos, const {{ ctype }} *__restrict__ utens, diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmemmerged.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmemmerged.j2 index c3123ea..261eafb 100644 --- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmemmerged.j2 +++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmemmerged.j2 @@ -389,7 +389,7 @@ __forceinline__ __device__ void forward_sweep(const {{ ctype }} *__restrict__ wc } -__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel( +__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}( const {{ ctype }} *__restrict__ ustage, const {{ ctype }} *__restrict__ upos, const {{ ctype }} *__restrict__ utens, diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_sharedmem.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_sharedmem.j2 index de0102c..22dfc63 100644 --- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_sharedmem.j2 +++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_sharedmem.j2 @@ -194,7 +194,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift, } -__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel( +__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}( const {{ ctype }} *__restrict__ ustage, const {{ ctype }} *__restrict__ upos, const {{ ctype }} *__restrict__ utens, diff --git a/stencil_benchmarks/tools/compilation.py b/stencil_benchmarks/tools/compilation.py index ed113b7..45a1870 100644 --- a/stencil_benchmarks/tools/compilation.py +++ b/stencil_benchmarks/tools/compilation.py @@ -122,7 +122,7 @@ def __init__(self, compile_command = ['gcc'] if extension.lower() == '.c' else ['g++'] if compile_command[0].endswith('nvcc'): - compile_command += ['-Xcompiler', '-shared', '-Xcompiler', '-fPIC'] + compile_command += ['-Xcompiler', '-shared', '-Xcompiler', '-fPIC', '--generate-line-info'] else: compile_command += ['-shared', '-fPIC']