Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit. Hold shift + click to select a range.
db2de8e
Fix regex filtering when creating plots
iomaganaris Feb 6, 2024
513c9f1
Drop deprecated gpu_timers argument
iomaganaris Feb 6, 2024
cfcf9cf
Added script for H100
iomaganaris Feb 6, 2024
142820b
Enabled code printing
iomaganaris Feb 7, 2024
cfeb794
Trying to give informative names to the kernels
iomaganaris Feb 7, 2024
790e1de
Write the actual file with the code
iomaganaris Feb 7, 2024
f03caf9
Open file for writing
iomaganaris Feb 7, 2024
799e3e3
Write str instead of bytecode
iomaganaris Feb 7, 2024
d2ccc06
Print compilation command
iomaganaris Feb 7, 2024
c3938ef
Make sure that content is flushed to file
iomaganaris Feb 7, 2024
56bbcf4
Fix gpu_kernel name in vertical advection kernels
iomaganaris Feb 7, 2024
6d4828b
Added P100 script
iomaganaris Feb 8, 2024
b9f211b
Small improvement in printing source code file of generated kernels
iomaganaris Feb 8, 2024
28740ef
Revert changes in A100 script
iomaganaris Feb 8, 2024
3e94cc7
Merge remote-tracking branch 'my-fork/ioannmag/plot_fixes' into ioann…
iomaganaris Feb 8, 2024
7309bff
Fixed extension
iomaganaris Feb 8, 2024
ca10adf
Remove debug print
iomaganaris Feb 8, 2024
a399322
Reorder gpu benchmark scripts
iomaganaris Feb 8, 2024
ef15ac9
Trying better block size for vec adv classic
iomaganaris Feb 13, 2024
cf6044c
Trying better config for vertical advection in H100
iomaganaris Feb 13, 2024
6b1512e
Revert changes in the sharedmem implementation of veradv
iomaganaris Feb 15, 2024
394292a
Update hor dif configuration for H100
iomaganaris Feb 15, 2024
5731e9e
Add line info for use with ncu and --import-source yes
iomaganaris Feb 15, 2024
b7c0a5e
Merge branch 'ioannmag/debug_tests' into ioannmag/gpu_benchmarks
iomaganaris Feb 15, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,14 @@ def pybind11_extension(m):
'console_scripts': [
'sbench=stencil_benchmarks.scripts.sbench:main',
'sbench-analyze=stencil_benchmarks.scripts.sbench_analyze:main',
'sbench-h100-collection=stencil_benchmarks.scripts'
'.sbench_h100_collection:main',
'sbench-a100-collection=stencil_benchmarks.scripts'
'.sbench_a100_collection:main',
'sbench-v100-collection=stencil_benchmarks.scripts'
'.sbench_v100_collection:main',
'sbench-p100-collection=stencil_benchmarks.scripts'
'.sbench_p100_collection:main',
'sbench-mi50-collection=stencil_benchmarks.scripts'
'.sbench_mi50_collection:main',
'sbench-mi100-collection=stencil_benchmarks.scripts'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,10 @@ def setup(self):
self.compiler_flags = (self.default_compiler_flags() + ' ' +
self.compiler_flags).strip()

filename = self.template_file().partition('.')[0]

try:
self.compiled = compilation.GnuLibrary(code, [self.compiler] +
self.compiled = compilation.GnuLibrary(code, filename, [self.compiler] +
self.compiler_flags.split())
except compilation.CompilationError as error:
raise ParameterError(*error.args) from error
Expand Down Expand Up @@ -114,7 +116,8 @@ def template_args(self):
dry_runs=self.dry_runs,
timers=self.timers,
strides=self.strides,
index_type=self.index_type)
index_type=self.index_type,
implementation_name=self.template_file().partition('.')[0])

@contextlib.contextmanager
def on_device(self, data):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ using hipDeviceProp = hipDeviceProp_t;
}

{% block gpu_kernel %}
__global__ void gpu_kernel(
__global__ void gpu_kernel_{{ implementation_name }}(
{%- for arg in args %}
{{ ctype }} * __restrict__ {{ arg }}{{ "," if not loop.last }}
{%- endfor %}
Expand Down Expand Up @@ -135,7 +135,7 @@ extern "C" int kernel(
}

for (int dry_run = 0; dry_run < {{ dry_runs }}; ++dry_run) {
gpu_kernel<<<grid_size, block_size, smem_size>>>(
gpu_kernel_{{ implementation_name }}<<<grid_size, block_size, smem_size>>>(
{%- for arg in args %}
{{ arg }}{{ "," if not loop.last }}
{%- endfor %}
Expand All @@ -158,9 +158,9 @@ extern "C" int kernel(
{%- endif %}

{%- if timers == 'hip-ext' %}
hipExtLaunchKernelGGL(gpu_kernel, grid_size, block_size, smem_size, 0, start, stop, 0,
hipExtLaunchKernelGGL(gpu_kernel_{{ implementation_name }}, grid_size, block_size, smem_size, 0, start, stop, 0,
{%- else %}
gpu_kernel<<<grid_size, block_size, smem_size>>>(
gpu_kernel_{{ implementation_name }}<<<grid_size, block_size, smem_size>>>(
{%- endif %}
{%- for arg in args %}
{{ arg }}{{ "," if not loop.last }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift,
}


__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel(
__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}(
const {{ ctype }} *__restrict__ ustage,
const {{ ctype }} *__restrict__ upos,
const {{ ctype }} *__restrict__ utens,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift,
}


__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel(
__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}(
const {{ ctype }} *__restrict__ ustage,
const {{ ctype }} *__restrict__ upos,
const {{ ctype }} *__restrict__ utens,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ __forceinline__ __device__ void forward_sweep(const {{ ctype }} *__restrict__ wc
}


__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel(
__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}(
const {{ ctype }} *__restrict__ ustage,
const {{ ctype }} *__restrict__ upos,
const {{ ctype }} *__restrict__ utens,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift,
}


__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel(
__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}(
const {{ ctype }} *__restrict__ ustage,
const {{ ctype }} *__restrict__ upos,
const {{ ctype }} *__restrict__ utens,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ def setup(self):
if self.compiler.endswith('icpc'):
os.environ['KMP_INIT_AT_FORK'] = '0'

self.compiled = compilation.GnuLibrary(code, self.compile_command())
filename = self.template_file().partition('.')[0]

self.compiled = compilation.GnuLibrary(code, filename, self.compile_command())

if self.verify and self.dry_runs:
warnings.warn(
Expand Down
2 changes: 2 additions & 0 deletions stencil_benchmarks/benchmarks_collection/stream/cuda_hip.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ def setup(self):
code = template.render(template_file, **self.template_args())
if self.print_code:
print(cpphelpers.format_code(code))
filename = 'cuda_hip'
self.compiled = compilation.GnuLibrary(code,
filename,
self.compile_command(),
extension='.cu')

Expand Down
3 changes: 2 additions & 1 deletion stencil_benchmarks/benchmarks_collection/stream/mc_calpin.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,9 @@ def setup(self):
r'/\* [a-z ]*"tuned" versions of the kernels \*/(.*)',
cpphelpers.format_code(code),
re.MULTILINE | re.DOTALL).group(1))

filename = self.template_file().partition('.')[0]
self.compiled = compilation.GnuLibrary(code,
filename,
self.compile_command(),
extension='.c')

Expand Down
1 change: 0 additions & 1 deletion stencil_benchmarks/scripts/sbench_a100_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def main():
gpu_architecture='sm_80',
verify=False,
dry_runs=1,
gpu_timers=True,
alignment=128,
dtype='float32')

Expand Down
6 changes: 3 additions & 3 deletions stencil_benchmarks/scripts/sbench_analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,10 @@ def plot(csv, uniform, ylim, title, auto_group, group, select, filter,

regexes = []
for regex in label_regex:
splitter = label_regex[0]
if label_regex[-1] != splitter:
splitter = regex[0]
if regex[-1] != splitter:
raise ValueError('expected input in the form /pattern/repl/')
pattern, repl = label_regex[1:-1].split(splitter, 1)
pattern, repl = regex[1:-1].split(splitter, 1)
regexes.append((re.compile(pattern), repl))

for index, row in df.iterrows():
Expand Down
176 changes: 176 additions & 0 deletions stencil_benchmarks/scripts/sbench_h100_collection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
# Stencil Benchmarks
#
# Copyright (c) 2017-2021, ETH Zurich and MeteoSwiss
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# SPDX-License-Identifier: BSD-3-Clause

import click

from stencil_benchmarks.benchmarks_collection.stencils.cuda_hip import (
basic, horizontal_diffusion as hdiff, vertical_advection as vadv)
from stencil_benchmarks.tools.multirun import (Configuration,
run_scaling_benchmark,
truncate_block_size_to_domain,
default_kwargs)


@click.group()
def main():
    """Command-line entry point grouping the H100 benchmark subcommands."""
    pass


# Shared defaults for every subcommand below: CUDA backend compiled with nvcc
# for sm_90 (H100), verification disabled, one dry run, 128-byte alignment,
# single-precision data. Individual commands extend/override these via
# common_kwargs(option, ...).
common_kwargs = default_kwargs(backend='cuda',
                               compiler='nvcc',
                               gpu_architecture='sm_90',
                               verify=False,
                               dry_runs=1,
                               alignment=128,
                               dtype='float32')


@main.command()
@click.argument('output', type=click.Path())
@click.option('--executions', '-e', type=int, default=101)
@click.option('--option', '-o', multiple=True)
def basic_bandwidth(output, executions, option):
    """Run the basic bandwidth stencils and write the results to OUTPUT (CSV)."""
    base_kwargs = common_kwargs(
        option,
        loop='3D',
        block_size=(128, 2, 1),
        halo=(1, 1, 1),
    )

    # The pure-stream reference run uses a 1D loop, a flat block and no halo.
    stream_kwargs = {
        **base_kwargs,
        'loop': '1D',
        'block_size': (1024, 1, 1),
        'halo': (0, 0, 0),
    }

    configurations = [
        Configuration(basic.Copy, name='stream', **stream_kwargs),
        Configuration(basic.Empty, name='empty', **base_kwargs),
        Configuration(basic.Copy, name='copy', **base_kwargs),
    ]
    # One-sided and symmetric averages along each of the three axes.
    for axis, label in enumerate(('i', 'j', 'k')):
        configurations.append(
            Configuration(basic.OnesidedAverage,
                          name='avg-' + label,
                          axis=axis,
                          **base_kwargs))
    for axis, label in enumerate(('i', 'j', 'k')):
        configurations.append(
            Configuration(basic.SymmetricAverage,
                          name='sym-avg-' + label,
                          axis=axis,
                          **base_kwargs))
    configurations.append(
        Configuration(basic.Laplacian,
                      name='lap-ij',
                      along_x=True,
                      along_y=True,
                      along_z=False,
                      **base_kwargs))

    results = run_scaling_benchmark(configurations, executions)
    results.to_csv(output)


@main.command()
@click.argument('output', type=click.Path())
@click.option('--executions', '-e', type=int, default=101)
@click.option('--option', '-o', multiple=True)
def horizontal_diffusion_bandwidth(output, executions, option):
    """Run the horizontal-diffusion stencils and write the results to OUTPUT (CSV)."""
    run_kwargs = common_kwargs(option)

    # (benchmark class, block size, extra keyword arguments)
    variants = [
        (hdiff.Classic, (32, 12, 1), {}),
        (hdiff.OnTheFly, (32, 16, 1), {'loop': '3D'}),
        (hdiff.OnTheFlyIncache, (32, 8, 1), {}),
        (hdiff.JScanSharedMem, (256, 32, 1), {}),
        (hdiff.JScanOtfIncache, (128, 4, 1), {}),
        (hdiff.JScanOtf, (128, 4, 1), {}),
        (hdiff.JScanShuffleIncache, (28, 8, 2), {}),
        (hdiff.JScanShuffle, (28, 8, 2), {}),
        (hdiff.JScanShuffleSystolic, (28, 4, 3), {}),
    ]
    configurations = [
        Configuration(cls, block_size=block_size, **extra, **run_kwargs)
        for cls, block_size, extra in variants
    ]

    def preprocess(**cfg):
        # Configurations whose x block size is exactly 28 are kept as-is;
        # all others are truncated to the domain size.
        if cfg['block_size'][0] == 28:
            return cfg
        return truncate_block_size_to_domain(**cfg)

    results = run_scaling_benchmark(configurations,
                                    executions,
                                    preprocess_args=preprocess)
    results.to_csv(output)


@main.command()
@click.argument('output', type=click.Path())
@click.option('--executions', '-e', type=int, default=101)
@click.option('--option', '-o', multiple=True)
def vertical_advection_bandwidth(output, executions, option):
    """Run the vertical-advection stencils and write the results to OUTPUT (CSV)."""
    run_kwargs = common_kwargs(option)

    # (benchmark class, block size, unroll factor)
    variants = [
        (vadv.Classic, (128, 1), 8),
        (vadv.LocalMem, (128, 1), 28),
        (vadv.SharedMem, (64, 1), 0),
        (vadv.LocalMemMerged, (128, 1), 2),
    ]
    configurations = [
        Configuration(cls,
                      block_size=block_size,
                      unroll_factor=unroll,
                      **run_kwargs) for cls, block_size, unroll in variants
    ]

    results = run_scaling_benchmark(
        configurations,
        executions,
        preprocess_args=truncate_block_size_to_domain)
    results.to_csv(output)


# Allow running this collection directly as a script in addition to the
# installed console entry point.
if __name__ == '__main__':
    main()
1 change: 0 additions & 1 deletion stencil_benchmarks/scripts/sbench_mi100_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def main():
gpu_architecture='gfx908',
verify=False,
dry_runs=100,
gpu_timers=True,
alignment=512,
dtype='float32')

Expand Down
1 change: 0 additions & 1 deletion stencil_benchmarks/scripts/sbench_mi50_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def main():
gpu_architecture='gfx906',
verify=False,
dry_runs=1,
gpu_timers=True,
alignment=64,
dtype='float32')

Expand Down
Loading