Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit. Hold shift + click to select a range.
db2de8e
Fix regex filtering when creating plots
iomaganaris Feb 6, 2024
513c9f1
Drop deprecated gpu_timers argument
iomaganaris Feb 6, 2024
cfcf9cf
Added script for H100
iomaganaris Feb 6, 2024
142820b
Enabled code printing
iomaganaris Feb 7, 2024
cfeb794
Trying to give informative names to the kernels
iomaganaris Feb 7, 2024
790e1de
Write the actual file with the code
iomaganaris Feb 7, 2024
f03caf9
Open file for writing
iomaganaris Feb 7, 2024
799e3e3
Write str instead of bytecode
iomaganaris Feb 7, 2024
d2ccc06
Print compilation command
iomaganaris Feb 7, 2024
c3938ef
Make sure that content is flushed to file
iomaganaris Feb 7, 2024
56bbcf4
Fix gpu_kernel name in vertical advection kernels
iomaganaris Feb 7, 2024
6d4828b
Added P100 script
iomaganaris Feb 8, 2024
b9f211b
Small improvement in printing source code file of generated kernels
iomaganaris Feb 8, 2024
28740ef
Revert changes in A100 script
iomaganaris Feb 8, 2024
3e94cc7
Merge remote-tracking branch 'my-fork/ioannmag/plot_fixes' into ioann…
iomaganaris Feb 8, 2024
7309bff
Fixed extension
iomaganaris Feb 8, 2024
ca10adf
Remove debug print
iomaganaris Feb 8, 2024
a399322
Reorder gpu benchmark scripts
iomaganaris Feb 8, 2024
ef15ac9
Trying better block size for vec adv classic
iomaganaris Feb 13, 2024
cf6044c
Trying better config for vertical advection in H100
iomaganaris Feb 13, 2024
6b1512e
Revert changes in the sharedmem implementation of veradv
iomaganaris Feb 15, 2024
394292a
Update hor dif configuration for H100
iomaganaris Feb 15, 2024
5731e9e
Add line info for use with ncu and --import-source yes
iomaganaris Feb 15, 2024
b7c0a5e
Merge branch 'ioannmag/debug_tests' into ioannmag/gpu_benchmarks
iomaganaris Feb 15, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,14 @@ def pybind11_extension(m):
'console_scripts': [
'sbench=stencil_benchmarks.scripts.sbench:main',
'sbench-analyze=stencil_benchmarks.scripts.sbench_analyze:main',
'sbench-h100-collection=stencil_benchmarks.scripts'
'.sbench_h100_collection:main',
'sbench-a100-collection=stencil_benchmarks.scripts'
'.sbench_a100_collection:main',
'sbench-v100-collection=stencil_benchmarks.scripts'
'.sbench_v100_collection:main',
'sbench-p100-collection=stencil_benchmarks.scripts'
'.sbench_p100_collection:main',
'sbench-mi50-collection=stencil_benchmarks.scripts'
'.sbench_mi50_collection:main',
'sbench-mi100-collection=stencil_benchmarks.scripts'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,10 @@ def setup(self):
self.compiler_flags = (self.default_compiler_flags() + ' ' +
self.compiler_flags).strip()

filename = self.template_file().partition('.')[0]

try:
self.compiled = compilation.GnuLibrary(code, [self.compiler] +
self.compiled = compilation.GnuLibrary(code, filename, [self.compiler] +
self.compiler_flags.split())
except compilation.CompilationError as error:
raise ParameterError(*error.args) from error
Expand Down Expand Up @@ -114,7 +116,8 @@ def template_args(self):
dry_runs=self.dry_runs,
timers=self.timers,
strides=self.strides,
index_type=self.index_type)
index_type=self.index_type,
implementation_name=self.template_file().partition('.')[0])

@contextlib.contextmanager
def on_device(self, data):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ using hipDeviceProp = hipDeviceProp_t;
}

{% block gpu_kernel %}
__global__ void gpu_kernel(
__global__ void gpu_kernel_{{ implementation_name }}(
{%- for arg in args %}
{{ ctype }} * __restrict__ {{ arg }}{{ "," if not loop.last }}
{%- endfor %}
Expand Down Expand Up @@ -135,7 +135,7 @@ extern "C" int kernel(
}

for (int dry_run = 0; dry_run < {{ dry_runs }}; ++dry_run) {
gpu_kernel<<<grid_size, block_size, smem_size>>>(
gpu_kernel_{{ implementation_name }}<<<grid_size, block_size, smem_size>>>(
{%- for arg in args %}
{{ arg }}{{ "," if not loop.last }}
{%- endfor %}
Expand All @@ -158,9 +158,9 @@ extern "C" int kernel(
{%- endif %}

{%- if timers == 'hip-ext' %}
hipExtLaunchKernelGGL(gpu_kernel, grid_size, block_size, smem_size, 0, start, stop, 0,
hipExtLaunchKernelGGL(gpu_kernel_{{ implementation_name }}, grid_size, block_size, smem_size, 0, start, stop, 0,
{%- else %}
gpu_kernel<<<grid_size, block_size, smem_size>>>(
gpu_kernel_{{ implementation_name }}<<<grid_size, block_size, smem_size>>>(
{%- endif %}
{%- for arg in args %}
{{ arg }}{{ "," if not loop.last }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift,
}


__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel(
__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}(
const {{ ctype }} *__restrict__ ustage,
const {{ ctype }} *__restrict__ upos,
const {{ ctype }} *__restrict__ utens,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift,
}


__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel(
__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}(
const {{ ctype }} *__restrict__ ustage,
const {{ ctype }} *__restrict__ upos,
const {{ ctype }} *__restrict__ utens,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ __forceinline__ __device__ void forward_sweep(const {{ ctype }} *__restrict__ wc
}


__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel(
__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}(
const {{ ctype }} *__restrict__ ustage,
const {{ ctype }} *__restrict__ upos,
const {{ ctype }} *__restrict__ utens,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ __forceinline__ __device__ void forward_sweep(const {{ index_type }} ishift,
}


__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel(
__global__ void __launch_bounds__({{ block_size[0] * block_size[1] }}) gpu_kernel_{{ implementation_name }}(
const {{ ctype }} *__restrict__ ustage,
const {{ ctype }} *__restrict__ upos,
const {{ ctype }} *__restrict__ utens,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ def setup(self):
if self.compiler.endswith('icpc'):
os.environ['KMP_INIT_AT_FORK'] = '0'

self.compiled = compilation.GnuLibrary(code, self.compile_command())
filename = self.template_file().partition('.')[0]

self.compiled = compilation.GnuLibrary(code, filename, self.compile_command())

if self.verify and self.dry_runs:
warnings.warn(
Expand Down
2 changes: 2 additions & 0 deletions stencil_benchmarks/benchmarks_collection/stream/cuda_hip.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ def setup(self):
code = template.render(template_file, **self.template_args())
if self.print_code:
print(cpphelpers.format_code(code))
filename = 'cuda_hip'
self.compiled = compilation.GnuLibrary(code,
filename,
self.compile_command(),
extension='.cu')

Expand Down
3 changes: 2 additions & 1 deletion stencil_benchmarks/benchmarks_collection/stream/mc_calpin.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,9 @@ def setup(self):
r'/\* [a-z ]*"tuned" versions of the kernels \*/(.*)',
cpphelpers.format_code(code),
re.MULTILINE | re.DOTALL).group(1))

filename = self.template_file().partition('.')[0]
self.compiled = compilation.GnuLibrary(code,
filename,
self.compile_command(),
extension='.c')

Expand Down
1 change: 0 additions & 1 deletion stencil_benchmarks/scripts/sbench_a100_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def main():
gpu_architecture='sm_80',
verify=False,
dry_runs=1,
gpu_timers=True,
alignment=128,
dtype='float32')

Expand Down
6 changes: 3 additions & 3 deletions stencil_benchmarks/scripts/sbench_analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,10 @@ def plot(csv, uniform, ylim, title, auto_group, group, select, filter,

regexes = []
for regex in label_regex:
splitter = label_regex[0]
if label_regex[-1] != splitter:
splitter = regex[0]
if regex[-1] != splitter:
raise ValueError('expected input in the form /pattern/repl/')
pattern, repl = label_regex[1:-1].split(splitter, 1)
pattern, repl = regex[1:-1].split(splitter, 1)
regexes.append((re.compile(pattern), repl))

for index, row in df.iterrows():
Expand Down
176 changes: 176 additions & 0 deletions stencil_benchmarks/scripts/sbench_h100_collection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
# Stencil Benchmarks
#
# Copyright (c) 2017-2021, ETH Zurich and MeteoSwiss
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# SPDX-License-Identifier: BSD-3-Clause

import click

from stencil_benchmarks.benchmarks_collection.stencils.cuda_hip import (
basic, horizontal_diffusion as hdiff, vertical_advection as vadv)
from stencil_benchmarks.tools.multirun import (Configuration,
run_scaling_benchmark,
truncate_block_size_to_domain,
default_kwargs)


@click.group()
def main():
    """Command-line entry point grouping the H100 benchmark subcommands."""
    pass


# Shared defaults for every subcommand below: CUDA backend compiled with nvcc
# for sm_90 (H100), verification disabled, one dry run, 128-byte alignment,
# single-precision data. Individual commands extend/override these via
# common_kwargs(option, ...).
common_kwargs = default_kwargs(backend='cuda',
                               compiler='nvcc',
                               gpu_architecture='sm_90',
                               verify=False,
                               dry_runs=1,
                               alignment=128,
                               dtype='float32')


@main.command()
@click.argument('output', type=click.Path())
@click.option('--executions', '-e', type=int, default=101)
@click.option('--option', '-o', multiple=True)
def basic_bandwidth(output, executions, option):
    """Run the basic bandwidth stencils and write the results to OUTPUT (CSV)."""
    base_kwargs = common_kwargs(
        option,
        loop='3D',
        block_size=(128, 2, 1),
        halo=(1, 1, 1),
    )

    # The pure-stream reference run uses a 1D loop, a flat block and no halo.
    stream_kwargs = {
        **base_kwargs,
        'loop': '1D',
        'block_size': (1024, 1, 1),
        'halo': (0, 0, 0),
    }

    configurations = [
        Configuration(basic.Copy, name='stream', **stream_kwargs),
        Configuration(basic.Empty, name='empty', **base_kwargs),
        Configuration(basic.Copy, name='copy', **base_kwargs),
    ]
    # One-sided and symmetric averages along each of the three axes.
    for axis, label in enumerate(('i', 'j', 'k')):
        configurations.append(
            Configuration(basic.OnesidedAverage,
                          name='avg-' + label,
                          axis=axis,
                          **base_kwargs))
    for axis, label in enumerate(('i', 'j', 'k')):
        configurations.append(
            Configuration(basic.SymmetricAverage,
                          name='sym-avg-' + label,
                          axis=axis,
                          **base_kwargs))
    configurations.append(
        Configuration(basic.Laplacian,
                      name='lap-ij',
                      along_x=True,
                      along_y=True,
                      along_z=False,
                      **base_kwargs))

    results = run_scaling_benchmark(configurations, executions)
    results.to_csv(output)


@main.command()
@click.argument('output', type=click.Path())
@click.option('--executions', '-e', type=int, default=101)
@click.option('--option', '-o', multiple=True)
def horizontal_diffusion_bandwidth(output, executions, option):
    """Run the horizontal-diffusion stencils and write the results to OUTPUT (CSV)."""
    run_kwargs = common_kwargs(option)

    # (benchmark class, block size, extra keyword arguments)
    variants = [
        (hdiff.Classic, (32, 12, 1), {}),
        (hdiff.OnTheFly, (32, 16, 1), {'loop': '3D'}),
        (hdiff.OnTheFlyIncache, (32, 8, 1), {}),
        (hdiff.JScanSharedMem, (256, 32, 1), {}),
        (hdiff.JScanOtfIncache, (128, 4, 1), {}),
        (hdiff.JScanOtf, (128, 4, 1), {}),
        (hdiff.JScanShuffleIncache, (28, 8, 2), {}),
        (hdiff.JScanShuffle, (28, 8, 2), {}),
        (hdiff.JScanShuffleSystolic, (28, 4, 3), {}),
    ]
    configurations = [
        Configuration(cls, block_size=block_size, **extra, **run_kwargs)
        for cls, block_size, extra in variants
    ]

    def preprocess(**cfg):
        # Configurations whose x block size is exactly 28 are kept as-is;
        # all others are truncated to the domain size.
        if cfg['block_size'][0] == 28:
            return cfg
        return truncate_block_size_to_domain(**cfg)

    results = run_scaling_benchmark(configurations,
                                    executions,
                                    preprocess_args=preprocess)
    results.to_csv(output)


@main.command()
@click.argument('output', type=click.Path())
@click.option('--executions', '-e', type=int, default=101)
@click.option('--option', '-o', multiple=True)
def vertical_advection_bandwidth(output, executions, option):
    """Run the vertical-advection stencils and write the results to OUTPUT (CSV)."""
    run_kwargs = common_kwargs(option)

    # (benchmark class, block size, unroll factor)
    variants = [
        (vadv.Classic, (128, 1), 8),
        (vadv.LocalMem, (128, 1), 28),
        (vadv.SharedMem, (64, 1), 0),
        (vadv.LocalMemMerged, (128, 1), 2),
    ]
    configurations = [
        Configuration(cls,
                      block_size=block_size,
                      unroll_factor=unroll,
                      **run_kwargs) for cls, block_size, unroll in variants
    ]

    results = run_scaling_benchmark(
        configurations,
        executions,
        preprocess_args=truncate_block_size_to_domain)
    results.to_csv(output)


# Allow running this collection directly as a script in addition to the
# installed console entry point.
if __name__ == '__main__':
    main()
1 change: 0 additions & 1 deletion stencil_benchmarks/scripts/sbench_mi100_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def main():
gpu_architecture='gfx908',
verify=False,
dry_runs=100,
gpu_timers=True,
alignment=512,
dtype='float32')

Expand Down
1 change: 0 additions & 1 deletion stencil_benchmarks/scripts/sbench_mi50_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def main():
gpu_architecture='gfx906',
verify=False,
dry_runs=1,
gpu_timers=True,
alignment=64,
dtype='float32')

Expand Down
Loading