diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/mixin.py b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/mixin.py
index 0b3b334..c263c9e 100644
--- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/mixin.py
+++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/mixin.py
@@ -114,7 +114,8 @@ def template_args(self):
                     dry_runs=self.dry_runs,
                     timers=self.timers,
                     strides=self.strides,
-                    index_type=self.index_type)
+                    index_type=self.index_type,
+                    implementation_name=self.template_file().partition('.')[0])
 
     @contextlib.contextmanager
     def on_device(self, data):
diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/base.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/base.j2
index 1fe1d01..60d67b7 100644
--- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/base.j2
+++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/base.j2
@@ -55,7 +55,7 @@ using hipDeviceProp = hipDeviceProp_t;
                  }
 
 {% block gpu_kernel %}
-__global__ void gpu_kernel(
+__global__ void gpu_kernel_{{ implementation_name }}(
     {%- for arg in args %}
         {{ ctype }} * __restrict__ {{ arg }}{{ "," if not loop.last }}
     {%- endfor %}
@@ -135,7 +135,7 @@ extern "C" int kernel(
     }
 
     for (int dry_run = 0; dry_run < {{ dry_runs }}; ++dry_run) {
-        gpu_kernel<<<grid_size, block_size, smem_size>>>(
+        gpu_kernel_{{ implementation_name }}<<<grid_size, block_size, smem_size>>>(
             {%- for arg in args %}
                 {{ arg }}{{ "," if not loop.last }}
             {%- endfor %}
@@ -158,9 +158,9 @@ extern "C" int kernel(
     {%- endif %}
 
     {%- if timers == 'hip-ext' %}
-    hipExtLaunchKernelGGL(gpu_kernel, grid_size, block_size, smem_size, 0, start, stop, 0,
+    hipExtLaunchKernelGGL(gpu_kernel_{{ implementation_name }}, grid_size, block_size, smem_size, 0, start, stop, 0,
     {%- else %}
-    gpu_kernel<<<grid_size, block_size, smem_size>>>(
+    gpu_kernel_{{ implementation_name }}<<<grid_size, block_size, smem_size>>>(
     {%- endif %}
         {%- for arg in args %}
             {{ arg }}{{ "," if not loop.last }}
diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_classic.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_classic.j2
index e297fb1..0c7e731 100644
--- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_classic.j2
+++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_classic.j2
@@ -186,7 +186,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift,
 }
 
 
-__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel(
+__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}(
     const {{ ctype }} *__restrict__ ustage,
     const {{ ctype }} *__restrict__ upos,
     const {{ ctype }} *__restrict__ utens,
diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmem.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmem.j2
index 708828c..233d86d 100644
--- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmem.j2
+++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmem.j2
@@ -186,7 +186,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift,
 }
 
 
-__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel(
+__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}(
     const {{ ctype }} *__restrict__ ustage,
     const {{ ctype }} *__restrict__ upos,
     const {{ ctype }} *__restrict__ utens,
diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmemmerged.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmemmerged.j2
index c3123ea..261eafb 100644
--- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmemmerged.j2
+++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmemmerged.j2
@@ -389,7 +389,7 @@ __forceinline__ __device__ void forward_sweep(const {{ ctype }} *__restrict__ wc
 }
 
 
-__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel(
+__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}(
     const {{ ctype }} *__restrict__ ustage,
     const {{ ctype }} *__restrict__ upos,
     const {{ ctype }} *__restrict__ utens,
diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_sharedmem.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_sharedmem.j2
index de0102c..22dfc63 100644
--- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_sharedmem.j2
+++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_sharedmem.j2
@@ -194,7 +194,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift,
 }
 
 
-__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel(
+__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}(
     const {{ ctype }} *__restrict__ ustage,
     const {{ ctype }} *__restrict__ upos,
     const {{ ctype }} *__restrict__ utens,
diff --git a/stencil_benchmarks/tools/compilation.py b/stencil_benchmarks/tools/compilation.py
index ed113b7..45a1870 100644
--- a/stencil_benchmarks/tools/compilation.py
+++ b/stencil_benchmarks/tools/compilation.py
@@ -122,7 +122,7 @@ def __init__(self,
             compile_command = ['gcc'] if extension.lower() == '.c' else ['g++']
 
         if compile_command[0].endswith('nvcc'):
-            compile_command += ['-Xcompiler', '-shared', '-Xcompiler', '-fPIC']
+            compile_command += ['-Xcompiler', '-shared', '-Xcompiler', '-fPIC', '--generate-line-info']
         else:
             compile_command += ['-shared', '-fPIC']