diff --git a/setup.py b/setup.py index 874cef6..9e4494d 100644 --- a/setup.py +++ b/setup.py @@ -93,10 +93,14 @@ def pybind11_extension(m): 'console_scripts': [ 'sbench=stencil_benchmarks.scripts.sbench:main', 'sbench-analyze=stencil_benchmarks.scripts.sbench_analyze:main', + 'sbench-h100-collection=stencil_benchmarks.scripts' + '.sbench_h100_collection:main', 'sbench-a100-collection=stencil_benchmarks.scripts' '.sbench_a100_collection:main', 'sbench-v100-collection=stencil_benchmarks.scripts' '.sbench_v100_collection:main', + 'sbench-p100-collection=stencil_benchmarks.scripts' + '.sbench_p100_collection:main', 'sbench-mi50-collection=stencil_benchmarks.scripts' '.sbench_mi50_collection:main', 'sbench-mi100-collection=stencil_benchmarks.scripts' diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/mixin.py b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/mixin.py index 0b3b334..654d082 100644 --- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/mixin.py +++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/mixin.py @@ -74,8 +74,10 @@ def setup(self): self.compiler_flags = (self.default_compiler_flags() + ' ' + self.compiler_flags).strip() + filename = self.template_file().partition('.')[0] + try: - self.compiled = compilation.GnuLibrary(code, [self.compiler] + + self.compiled = compilation.GnuLibrary(code, filename, [self.compiler] + self.compiler_flags.split()) except compilation.CompilationError as error: raise ParameterError(*error.args) from error @@ -114,7 +116,8 @@ def template_args(self): dry_runs=self.dry_runs, timers=self.timers, strides=self.strides, - index_type=self.index_type) + index_type=self.index_type, + implementation_name=self.template_file().partition('.')[0]) @contextlib.contextmanager def on_device(self, data): diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/base.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/base.j2 index 
1fe1d01..60d67b7 100644 --- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/base.j2 +++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/base.j2 @@ -55,7 +55,7 @@ using hipDeviceProp = hipDeviceProp_t; } {% block gpu_kernel %} -__global__ void gpu_kernel( +__global__ void gpu_kernel_{{ implementation_name }}( {%- for arg in args %} {{ ctype }} * __restrict__ {{ arg }}{{ "," if not loop.last }} {%- endfor %} @@ -135,7 +135,7 @@ extern "C" int kernel( } for (int dry_run = 0; dry_run < {{ dry_runs }}; ++dry_run) { - gpu_kernel<<<grid_size, block_size, smem_size>>>( + gpu_kernel_{{ implementation_name }}<<<grid_size, block_size, smem_size>>>( {%- for arg in args %} {{ arg }}{{ "," if not loop.last }} {%- endfor %} @@ -158,9 +158,9 @@ extern "C" int kernel( {%- endif %} {%- if timers == 'hip-ext' %} - hipExtLaunchKernelGGL(gpu_kernel, grid_size, block_size, smem_size, 0, start, stop, 0, + hipExtLaunchKernelGGL(gpu_kernel_{{ implementation_name }}, grid_size, block_size, smem_size, 0, start, stop, 0, {%- else %} - gpu_kernel<<<grid_size, block_size, smem_size>>>( + gpu_kernel_{{ implementation_name }}<<<grid_size, block_size, smem_size>>>( {%- endif %} {%- for arg in args %} {{ arg }}{{ "," if not loop.last }} diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_classic.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_classic.j2 index e297fb1..0c7e731 100644 --- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_classic.j2 +++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_classic.j2 @@ -186,7 +186,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift, } -__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel( +__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}( const {{ ctype }} *__restrict__ ustage, const {{ ctype }} *__restrict__ upos, const {{ ctype }} *__restrict__ utens, 
diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmem.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmem.j2 index 708828c..233d86d 100644 --- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmem.j2 +++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmem.j2 @@ -186,7 +186,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift, } -__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel( +__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}( const {{ ctype }} *__restrict__ ustage, const {{ ctype }} *__restrict__ upos, const {{ ctype }} *__restrict__ utens, diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmemmerged.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmemmerged.j2 index c3123ea..261eafb 100644 --- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmemmerged.j2 +++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_localmemmerged.j2 @@ -389,7 +389,7 @@ __forceinline__ __device__ void forward_sweep(const {{ ctype }} *__restrict__ wc } -__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel( +__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}( const {{ ctype }} *__restrict__ ustage, const {{ ctype }} *__restrict__ upos, const {{ ctype }} *__restrict__ utens, diff --git a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_sharedmem.j2 b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_sharedmem.j2 index 
de0102c..22dfc63 100644 --- a/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_sharedmem.j2 +++ b/stencil_benchmarks/benchmarks_collection/stencils/cuda_hip/templates/vertical_advection_sharedmem.j2 @@ -194,7 +194,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift, } -__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel( +__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}( const {{ ctype }} *__restrict__ ustage, const {{ ctype }} *__restrict__ upos, const {{ ctype }} *__restrict__ utens, diff --git a/stencil_benchmarks/benchmarks_collection/stencils/openmp/mixin.py b/stencil_benchmarks/benchmarks_collection/stencils/openmp/mixin.py index 3d63581..b41c622 100644 --- a/stencil_benchmarks/benchmarks_collection/stencils/openmp/mixin.py +++ b/stencil_benchmarks/benchmarks_collection/stencils/openmp/mixin.py @@ -69,7 +69,9 @@ def setup(self): if self.compiler.endswith('icpc'): os.environ['KMP_INIT_AT_FORK'] = '0' - self.compiled = compilation.GnuLibrary(code, self.compile_command()) + filename = self.template_file().partition('.')[0] + + self.compiled = compilation.GnuLibrary(code, filename, self.compile_command()) if self.verify and self.dry_runs: warnings.warn( diff --git a/stencil_benchmarks/benchmarks_collection/stream/cuda_hip.py b/stencil_benchmarks/benchmarks_collection/stream/cuda_hip.py index 8cbc7fe..8939e37 100644 --- a/stencil_benchmarks/benchmarks_collection/stream/cuda_hip.py +++ b/stencil_benchmarks/benchmarks_collection/stream/cuda_hip.py @@ -78,7 +78,9 @@ def setup(self): code = template.render(template_file, **self.template_args()) if self.print_code: print(cpphelpers.format_code(code)) + filename = 'cuda_hip' self.compiled = compilation.GnuLibrary(code, + filename, self.compile_command(), extension='.cu') diff --git a/stencil_benchmarks/benchmarks_collection/stream/mc_calpin.py 
b/stencil_benchmarks/benchmarks_collection/stream/mc_calpin.py index e8af96a..42ba9a1 100644 --- a/stencil_benchmarks/benchmarks_collection/stream/mc_calpin.py +++ b/stencil_benchmarks/benchmarks_collection/stream/mc_calpin.py @@ -67,8 +67,9 @@ def setup(self): r'/\* [a-z ]*"tuned" versions of the kernels \*/(.*)', cpphelpers.format_code(code), re.MULTILINE | re.DOTALL).group(1)) - + filename = self.template_file().partition('.')[0] self.compiled = compilation.GnuLibrary(code, + filename, self.compile_command(), extension='.c') diff --git a/stencil_benchmarks/scripts/sbench_a100_collection.py b/stencil_benchmarks/scripts/sbench_a100_collection.py index a5e9486..0c2631c 100644 --- a/stencil_benchmarks/scripts/sbench_a100_collection.py +++ b/stencil_benchmarks/scripts/sbench_a100_collection.py @@ -51,7 +51,6 @@ def main(): gpu_architecture='sm_80', verify=False, dry_runs=1, - gpu_timers=True, alignment=128, dtype='float32') diff --git a/stencil_benchmarks/scripts/sbench_analyze.py b/stencil_benchmarks/scripts/sbench_analyze.py index 9c27f81..5d324db 100644 --- a/stencil_benchmarks/scripts/sbench_analyze.py +++ b/stencil_benchmarks/scripts/sbench_analyze.py @@ -191,10 +191,10 @@ def plot(csv, uniform, ylim, title, auto_group, group, select, filter, regexes = [] for regex in label_regex: - splitter = label_regex[0] - if label_regex[-1] != splitter: + splitter = regex[0] + if regex[-1] != splitter: raise ValueError('expected input in the form /pattern/repl/') - pattern, repl = label_regex[1:-1].split(splitter, 1) + pattern, repl = regex[1:-1].split(splitter, 1) regexes.append((re.compile(pattern), repl)) for index, row in df.iterrows(): diff --git a/stencil_benchmarks/scripts/sbench_h100_collection.py b/stencil_benchmarks/scripts/sbench_h100_collection.py new file mode 100644 index 0000000..71b7f40 --- /dev/null +++ b/stencil_benchmarks/scripts/sbench_h100_collection.py @@ -0,0 +1,176 @@ +# Stencil Benchmarks +# +# Copyright (c) 2017-2021, ETH Zurich and MeteoSwiss +# 
All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# SPDX-License-Identifier: BSD-3-Clause + +import click + +from stencil_benchmarks.benchmarks_collection.stencils.cuda_hip import ( + basic, horizontal_diffusion as hdiff, vertical_advection as vadv) +from stencil_benchmarks.tools.multirun import (Configuration, + run_scaling_benchmark, + truncate_block_size_to_domain, + default_kwargs) + + +@click.group() +def main(): + pass + + +common_kwargs = default_kwargs(backend='cuda', + compiler='nvcc', + gpu_architecture='sm_90', + verify=False, + dry_runs=1, + alignment=128, + dtype='float32') + + +@main.command() +@click.argument('output', type=click.Path()) +@click.option('--executions', '-e', type=int, default=101) +@click.option('--option', '-o', multiple=True) +def basic_bandwidth(output, executions, option): + kwargs = common_kwargs( + option, + loop='3D', + block_size=(128, 2, 1), + halo=(1, 1, 1), + ) + + stream_kwargs = kwargs.copy() + stream_kwargs.update(loop='1D', block_size=(1024, 1, 1), halo=(0, 0, 0)) + + configurations = [ + Configuration(basic.Copy, name='stream', **stream_kwargs), + Configuration(basic.Empty, name='empty', **kwargs), + Configuration(basic.Copy, name='copy', **kwargs), + Configuration(basic.OnesidedAverage, name='avg-i', axis=0, **kwargs), + Configuration(basic.OnesidedAverage, name='avg-j', axis=1, **kwargs), + Configuration(basic.OnesidedAverage, name='avg-k', axis=2, **kwargs), + Configuration(basic.SymmetricAverage, + name='sym-avg-i', + axis=0, + **kwargs), + Configuration(basic.SymmetricAverage, + name='sym-avg-j', + axis=1, + **kwargs), + Configuration(basic.SymmetricAverage, + name='sym-avg-k', + axis=2, + **kwargs), + Configuration(basic.Laplacian, + name='lap-ij', + along_x=True, + along_y=True, + along_z=False, + **kwargs) + ] + + table = run_scaling_benchmark(configurations, executions) + table.to_csv(output) + + +@main.command() +@click.argument('output', type=click.Path()) +@click.option('--executions', '-e', type=int, default=101) +@click.option('--option', '-o', 
multiple=True) +def horizontal_diffusion_bandwidth(output, executions, option): + kwargs = common_kwargs(option) + + configurations = [ + Configuration(hdiff.Classic, block_size=(32, 12, 1), **kwargs), + Configuration(hdiff.OnTheFly, + block_size=(32, 16, 1), + loop='3D', + **kwargs), + Configuration(hdiff.OnTheFlyIncache, block_size=(32, 8, 1), **kwargs), + Configuration(hdiff.JScanSharedMem, block_size=(256, 32, 1), **kwargs), + Configuration(hdiff.JScanOtfIncache, block_size=(128, 4, 1), **kwargs), + Configuration(hdiff.JScanOtf, block_size=(128, 4, 1), **kwargs), + Configuration(hdiff.JScanShuffleIncache, + block_size=(28, 8, 2), + **kwargs), + Configuration(hdiff.JScanShuffle, block_size=(28, 8, 2), **kwargs), + Configuration(hdiff.JScanShuffleSystolic, + block_size=(28, 4, 3), + **kwargs) + ] + + def truncate_block_size_to_domain_if_possible(**kwargs): + if kwargs['block_size'][0] != 28: + return truncate_block_size_to_domain(**kwargs) + return kwargs + + table = run_scaling_benchmark( + configurations, + executions, + preprocess_args=truncate_block_size_to_domain_if_possible) + table.to_csv(output) + + +@main.command() +@click.argument('output', type=click.Path()) +@click.option('--executions', '-e', type=int, default=101) +@click.option('--option', '-o', multiple=True) +def vertical_advection_bandwidth(output, executions, option): + kwargs = common_kwargs(option) + + configurations = [ + Configuration(vadv.Classic, + block_size=(128, 1), + unroll_factor=8, + **kwargs), + Configuration(vadv.LocalMem, + block_size=(128, 1), + unroll_factor=28, + **kwargs), + Configuration(vadv.SharedMem, + block_size=(64, 1), + unroll_factor=0, + **kwargs), + Configuration(vadv.LocalMemMerged, + block_size=(128, 1), + unroll_factor=2, + **kwargs) + ] + + table = run_scaling_benchmark( + configurations, + executions, + preprocess_args=truncate_block_size_to_domain) + table.to_csv(output) + + +if __name__ == '__main__': + main() diff --git 
a/stencil_benchmarks/scripts/sbench_mi100_collection.py b/stencil_benchmarks/scripts/sbench_mi100_collection.py index 10de235..ca96336 100644 --- a/stencil_benchmarks/scripts/sbench_mi100_collection.py +++ b/stencil_benchmarks/scripts/sbench_mi100_collection.py @@ -51,7 +51,6 @@ def main(): gpu_architecture='gfx908', verify=False, dry_runs=100, - gpu_timers=True, alignment=512, dtype='float32') diff --git a/stencil_benchmarks/scripts/sbench_mi50_collection.py b/stencil_benchmarks/scripts/sbench_mi50_collection.py index 5b94f8e..68ed22a 100644 --- a/stencil_benchmarks/scripts/sbench_mi50_collection.py +++ b/stencil_benchmarks/scripts/sbench_mi50_collection.py @@ -51,7 +51,6 @@ def main(): gpu_architecture='gfx906', verify=False, dry_runs=1, - gpu_timers=True, alignment=64, dtype='float32') diff --git a/stencil_benchmarks/scripts/sbench_p100_collection.py b/stencil_benchmarks/scripts/sbench_p100_collection.py new file mode 100644 index 0000000..4eb8f45 --- /dev/null +++ b/stencil_benchmarks/scripts/sbench_p100_collection.py @@ -0,0 +1,176 @@ +# Stencil Benchmarks +# +# Copyright (c) 2017-2021, ETH Zurich and MeteoSwiss +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# SPDX-License-Identifier: BSD-3-Clause + +import click + +from stencil_benchmarks.benchmarks_collection.stencils.cuda_hip import ( + basic, horizontal_diffusion as hdiff, vertical_advection as vadv) +from stencil_benchmarks.tools.multirun import (Configuration, + run_scaling_benchmark, + truncate_block_size_to_domain, + default_kwargs) + + +@click.group() +def main(): + pass + + +common_kwargs = default_kwargs(backend='cuda', + compiler='nvcc', + gpu_architecture='sm_60', + verify=False, + dry_runs=1, + alignment=128, + dtype='float32') + + +@main.command() +@click.argument('output', type=click.Path()) +@click.option('--executions', '-e', type=int, default=101) +@click.option('--option', '-o', multiple=True) +def basic_bandwidth(output, executions, option): + kwargs = common_kwargs( + option, + loop='3D', + block_size=(128, 2, 1), + halo=(1, 1, 1), + ) + + stream_kwargs = kwargs.copy() + stream_kwargs.update(loop='1D', block_size=(1024, 1, 1), halo=(0, 0, 0)) + + configurations = [ + Configuration(basic.Copy, name='stream', **stream_kwargs), + Configuration(basic.Empty, name='empty', **kwargs), + Configuration(basic.Copy, name='copy', **kwargs), 
Configuration(basic.OnesidedAverage, name='avg-i', axis=0, **kwargs), + Configuration(basic.OnesidedAverage, name='avg-j', axis=1, **kwargs), + Configuration(basic.OnesidedAverage, name='avg-k', axis=2, **kwargs), + Configuration(basic.SymmetricAverage, + name='sym-avg-i', + axis=0, + **kwargs), + Configuration(basic.SymmetricAverage, + name='sym-avg-j', + axis=1, + **kwargs), + Configuration(basic.SymmetricAverage, + name='sym-avg-k', + axis=2, + **kwargs), + Configuration(basic.Laplacian, + name='lap-ij', + along_x=True, + along_y=True, + along_z=False, + **kwargs) + ] + + table = run_scaling_benchmark(configurations, executions) + table.to_csv(output) + + +@main.command() +@click.argument('output', type=click.Path()) +@click.option('--executions', '-e', type=int, default=101) +@click.option('--option', '-o', multiple=True) +def horizontal_diffusion_bandwidth(output, executions, option): + kwargs = common_kwargs(option) + + configurations = [ + Configuration(hdiff.Classic, block_size=(32, 16, 1), **kwargs), + Configuration(hdiff.OnTheFly, + block_size=(32, 16, 1), + loop='3D', + **kwargs), + Configuration(hdiff.OnTheFlyIncache, block_size=(32, 8, 1), **kwargs), + Configuration(hdiff.JScanSharedMem, block_size=(256, 32, 1), **kwargs), + Configuration(hdiff.JScanOtfIncache, block_size=(128, 4, 1), **kwargs), + Configuration(hdiff.JScanOtf, block_size=(128, 4, 1), **kwargs), + Configuration(hdiff.JScanShuffleIncache, + block_size=(28, 8, 2), + **kwargs), + Configuration(hdiff.JScanShuffle, block_size=(28, 8, 2), **kwargs), + Configuration(hdiff.JScanShuffleSystolic, + block_size=(28, 4, 3), + **kwargs) + ] + + def truncate_block_size_to_domain_if_possible(**kwargs): + if kwargs['block_size'][0] != 28: + return truncate_block_size_to_domain(**kwargs) + return kwargs + + table = run_scaling_benchmark( + configurations, + executions, + preprocess_args=truncate_block_size_to_domain_if_possible) + table.to_csv(output) + + +@main.command() +@click.argument('output', 
type=click.Path()) +@click.option('--executions', '-e', type=int, default=101) +@click.option('--option', '-o', multiple=True) +def vertical_advection_bandwidth(output, executions, option): + kwargs = common_kwargs(option) + + configurations = [ + Configuration(vadv.Classic, + block_size=(512, 1), + unroll_factor=8, + **kwargs), + Configuration(vadv.LocalMem, + block_size=(128, 1), + unroll_factor=28, + **kwargs), + Configuration(vadv.SharedMem, + block_size=(64, 1), + unroll_factor=0, + **kwargs), + Configuration(vadv.LocalMemMerged, + block_size=(512, 1), + unroll_factor=2, + **kwargs) + ] + + table = run_scaling_benchmark( + configurations, + executions, + preprocess_args=truncate_block_size_to_domain) + table.to_csv(output) + + +if __name__ == '__main__': + main() diff --git a/stencil_benchmarks/scripts/sbench_v100_collection.py b/stencil_benchmarks/scripts/sbench_v100_collection.py index 39cc700..1973545 100644 --- a/stencil_benchmarks/scripts/sbench_v100_collection.py +++ b/stencil_benchmarks/scripts/sbench_v100_collection.py @@ -51,7 +51,6 @@ def main(): gpu_architecture='sm_70', verify=False, dry_runs=1, - gpu_timers=True, alignment=128, dtype='float32') diff --git a/stencil_benchmarks/tools/compilation.py b/stencil_benchmarks/tools/compilation.py index ed113b7..987d3db 100644 --- a/stencil_benchmarks/tools/compilation.py +++ b/stencil_benchmarks/tools/compilation.py @@ -90,6 +90,7 @@ def _capture_output(stdout: TextIO, stderr: TextIO) -> Iterator[None]: class GnuLibrary: def __init__(self, code: str, + filename: str = 'tmp', compile_command: Optional[List[str]] = None, extension: Optional[str] = None): """Compile and load a C/C++-library. 
@@ -122,12 +123,16 @@ def __init__(self, compile_command = ['gcc'] if extension.lower() == '.c' else ['g++'] if compile_command[0].endswith('nvcc'): - compile_command += ['-Xcompiler', '-shared', '-Xcompiler', '-fPIC'] + compile_command += ['-Xcompiler', '-shared', '-Xcompiler', '-fPIC', '--generate-line-info'] else: compile_command += ['-shared', '-fPIC'] - with tempfile.NamedTemporaryFile(suffix=extension) as srcfile: - srcfile.write(code.encode()) + output_dir = "benchmarks_source_code" + os.makedirs(output_dir, exist_ok=True) + file_path = os.path.join(output_dir, "{}{}".format(filename, extension)) + + with open(file_path, 'w', encoding='utf-8') as srcfile: + srcfile.write(code) srcfile.flush() with tempfile.NamedTemporaryFile(suffix='.so') as library: